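Changed inline assembler labels to short form (numeric local labels such as `1:`, `2:`, `3:` referenced as `1b`, `2f`, `3f`, replacing the `.L01LOOP%=`-style names)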
This commit is contained in:
		
							parent
							
								
									1943ea91a8
								
							
						
					
					
						commit
						bc5fff7085
					
				| 
						 | 
				
			
			@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
	"vbroadcastss	       4(%4), %%xmm1		    \n\t"  // imag part of alpha
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
 | 
			
		||||
	"prefetcht0  768(%2,%0,4)                           \n\t"
 | 
			
		||||
	"vmovups        (%2,%0,4), %%xmm5                   \n\t" // 2 complex values from x
 | 
			
		||||
| 
						 | 
				
			
			@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
 | 
			
		||||
	"addq		$16, %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $8 , %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vbroadcastss	28(%2), %%ymm7                  \n\t"  // imag part x3
 | 
			
		||||
 | 
			
		||||
	"cmpq		$0 , %1				\n\t"
 | 
			
		||||
	"je		.L01END%=		        \n\t"
 | 
			
		||||
	"je		2f			        \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      320(%4,%0,4)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,4), %%ymm9	        \n\t" // 4 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$16, %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $8 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	".L01END%=:				        \n\t"
 | 
			
		||||
	"2:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		$4, %8				\n\t"
 | 
			
		||||
	"jne		.L02END%=			\n\t"
 | 
			
		||||
	"jne		3f				\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values form a0
 | 
			
		||||
	"vmovups	(%5,%0,4), %%ymm10              \n\t" // 4 complex values form a1
 | 
			
		||||
| 
						 | 
				
			
			@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
	"vmovups  %%ymm12,   (%3,%0,4)		        \n\t" // 4 complex values to y	
 | 
			
		||||
 | 
			
		||||
	".L02END%=:				        \n\t"
 | 
			
		||||
	"3:				        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vbroadcastss	12(%2), %%ymm3                  \n\t"  // imag part x1
 | 
			
		||||
 | 
			
		||||
	"cmpq		$0 , %1				\n\t"
 | 
			
		||||
	"je		.L01END%=		        \n\t"
 | 
			
		||||
	"je		2f			        \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      320(%4,%0,4)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,4), %%ymm9	        \n\t" // 4 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$16, %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $8 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	".L01END%=:				        \n\t"
 | 
			
		||||
	"2:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		$4, %6				\n\t"
 | 
			
		||||
	"jne		.L02END%=			\n\t"
 | 
			
		||||
	"jne		3f				\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values form a0
 | 
			
		||||
	"vmovups	(%5,%0,4), %%ymm10              \n\t" // 4 complex values form a1
 | 
			
		||||
| 
						 | 
				
			
			@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
	"vmovups  %%ymm12,   (%3,%0,4)		        \n\t" // 4 complex values to y	
 | 
			
		||||
 | 
			
		||||
	".L02END%=:				        \n\t"
 | 
			
		||||
	"3:				        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vbroadcastss	 4(%2), %%ymm1                  \n\t"  // imag part x0
 | 
			
		||||
 | 
			
		||||
	"cmpq		$0 , %1				\n\t"
 | 
			
		||||
	"je		.L01END%=		        \n\t"
 | 
			
		||||
	"je		2f			        \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      320(%4,%0,4)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,4), %%ymm9	        \n\t" // 4 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vmovups  %%ymm12,-64(%3,%0,4)		        \n\t" // 4 complex values to y	
 | 
			
		||||
	"vmovups  %%ymm13,-32(%3,%0,4)		        \n\t"	
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	".L01END%=:				        \n\t"
 | 
			
		||||
	"2:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		$4, %5				\n\t"
 | 
			
		||||
	"jne		.L02END%=			\n\t"
 | 
			
		||||
	"jne		3f				\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm8	        \n\t" // 4 complex values form a0
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
	"vmovups  %%ymm12,   (%3,%0,4)		        \n\t" // 4 complex values to y	
 | 
			
		||||
 | 
			
		||||
	".L02END%=:				        \n\t"
 | 
			
		||||
	"3:				        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
	"vbroadcastss	  (%5), %%ymm1                  \n\t"  // alpha_i
 | 
			
		||||
 | 
			
		||||
	"cmpq		$0 , %1				\n\t"
 | 
			
		||||
	"je		.L01END%=		        \n\t"
 | 
			
		||||
	"je		2f			        \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"vmovups	(%2,%0,4), %%ymm8	        \n\t" // 4 complex values from src
 | 
			
		||||
	"vmovups      32(%2,%0,4), %%ymm9	        \n\t" 
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
	"vmovups  %%ymm12,-64(%3,%0,4)		        \n\t" // 4 complex values to y	
 | 
			
		||||
	"vmovups  %%ymm13,-32(%3,%0,4)		        \n\t"	
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	".L01END%=:				        \n\t"
 | 
			
		||||
	"2:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		$4, %6				\n\t"
 | 
			
		||||
	"jne		.L02END%=			\n\t"
 | 
			
		||||
	"jne		3f				\n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%2,%0,4), %%ymm8	        \n\t" // 4 complex values src
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
 | 
			
		||||
	"vmovups  %%ymm12,   (%3,%0,4)		        \n\t" // 4 complex values to y	
 | 
			
		||||
 | 
			
		||||
	".L02END%=:				        \n\t"
 | 
			
		||||
	"3:				        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vxorps		%%ymm15, %%ymm15, %%ymm15	\n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
 | 
			
		||||
	"vmovups	(%5,%0,4), %%ymm5               \n\t" // 4 complex values from a1
 | 
			
		||||
| 
						 | 
				
			
			@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
        "addq		$8  , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4  , %1			        \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
	"cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L08END%=                      \n\t"
 | 
			
		||||
        "je             3f                      \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
        "prefetcht0      192(%4,%0,4)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
 | 
			
		||||
        "prefetcht0      192(%5,%0,4)                   \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$16 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $8  , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L08END%=:                                   \n\t"
 | 
			
		||||
        "3:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
        "vbroadcastss    (%8)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
        "vbroadcastss   4(%8)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vxorps		%%ymm11, %%ymm11, %%ymm11	\n\t" // temp
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
 | 
			
		||||
	"vmovups	(%5,%0,4), %%ymm5               \n\t" // 4 complex values from a1
 | 
			
		||||
| 
						 | 
				
			
			@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
        "addq		$8  , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4  , %1			        \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
	"cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L08END%=                      \n\t"
 | 
			
		||||
        "je             3f                      \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
        "prefetcht0      192(%4,%0,4)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
 | 
			
		||||
        "prefetcht0      192(%5,%0,4)                   \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$16 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $8  , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L08END%=:                                   \n\t"
 | 
			
		||||
        "3:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
        "vbroadcastss    (%6)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
        "vbroadcastss   4(%6)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
	"vxorps		%%ymm9 , %%ymm9 , %%ymm9 	\n\t" // temp
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
        "addq		$8  , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4  , %1			        \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
	"cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L08END%=                      \n\t"
 | 
			
		||||
        "je             3f                      \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
        "prefetcht0      192(%4,%0,4)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,4), %%ymm4	        \n\t" // 4 complex values from a0
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
 | 
			
		||||
        "addq		$16 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $8  , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L08END%=:                                   \n\t"
 | 
			
		||||
        "3:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
        "vbroadcastss    (%5)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
        "vbroadcastss   4(%5)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
	"vmovddup		(%4), %%xmm0		    \n\t"  // alpha	
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
 | 
			
		||||
        "prefetcht0      768(%3,%0,8)                       \n\t"
 | 
			
		||||
        "vmovups                  (%2,%0,8), %%xmm12         \n\t"  // 2 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
 | 
			
		||||
	"addq		$8 , %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $8 , %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
	"shufpd          $0,  %%xmm0, %%xmm0                \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
        // "prefetcht0      192(%2,%0,8)                       \n\t"
 | 
			
		||||
        // "prefetcht0      192(%3,%0,8)                       \n\t"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
 | 
			
		||||
	"addq		$8 , %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $8 , %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
	"vxorpd		%%xmm7, %%xmm7, %%xmm7	             \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				             \n\t"
 | 
			
		||||
	".L01LOOP%=:				             \n\t"
 | 
			
		||||
	"1:				             \n\t"
 | 
			
		||||
        "vmovups                  (%2,%0,8), %%xmm12         \n\t"  // 2 * x
 | 
			
		||||
        "vmovups                16(%2,%0,8), %%xmm13         \n\t"  // 2 * x
 | 
			
		||||
        "vmovups                32(%2,%0,8), %%xmm14         \n\t"  // 2 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
 | 
			
		||||
	"addq		$8 , %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $8 , %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	"vaddpd        %%xmm4, %%xmm5, %%xmm4	\n\t"
 | 
			
		||||
	"vaddpd        %%xmm6, %%xmm7, %%xmm6	\n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
	"xorpd		%%xmm7, %%xmm7	             \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
 | 
			
		||||
        "movups                  (%2,%0,8), %%xmm12         \n\t"  // 2 * x
 | 
			
		||||
        "movups                  (%3,%0,8), %%xmm8          \n\t"  // 2 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
 | 
			
		||||
	"addq		$8 , %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $8 , %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	"addpd        %%xmm5, %%xmm4	\n\t"
 | 
			
		||||
	"addpd        %%xmm7, %%xmm6	\n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"shufpd $0,  %%xmm13, %%xmm13    \n\t"	
 | 
			
		||||
 | 
			
		||||
	".align 16				       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
	"movups	       (%3,%0,8), %%xmm4	       \n\t"	// 2 * y
 | 
			
		||||
	"movups	     16(%3,%0,8), %%xmm5	       \n\t"	// 2 * y
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 | 
			
		|||
        "shufpd $0,  %%xmm12, %%xmm12            \n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                               \n\t"
 | 
			
		||||
        ".L01LOOP%=:                             \n\t"
 | 
			
		||||
        "1:                             \n\t"
 | 
			
		||||
        "movups       (%4,%0,8), %%xmm8          \n\t"  // 2 * a
 | 
			
		||||
        "movups     16(%4,%0,8), %%xmm9          \n\t"  // 2 * a
 | 
			
		||||
        "movups       (%3,%0,8), %%xmm4          \n\t"  // 2 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 | 
			
		|||
        "addq           $4 , %0                  \n\t"
 | 
			
		||||
        "subq           $4 , %1                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "jnz            .L01LOOP%=               \n\t"
 | 
			
		||||
        "jnz            1b               \n\t"
 | 
			
		||||
 | 
			
		||||
        :
 | 
			
		||||
        :
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"vbroadcastsd    (%9), %%ymm6 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L8LABEL%=                     \n\t"
 | 
			
		||||
        "jz             2f                     \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovupd	(%3,%0,8), %%ymm7	       \n\t"	// 4 * y
 | 
			
		||||
	"vxorpd		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L8LABEL%=:                                   \n\t"
 | 
			
		||||
        "2:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             3f                      \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorpd		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorpd		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"subq	        $8 , %1			      \n\t"		
 | 
			
		||||
	"vmovupd  %%ymm9,-32(%3,%0,8)		      \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
	"vzeroupper			        \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vbroadcastsd    (%8), %%ymm6 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L8LABEL%=                     \n\t"
 | 
			
		||||
        "jz             2f                     \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorpd		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorpd		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L8LABEL%=:                                   \n\t"
 | 
			
		||||
        "2:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L8END%=                       \n\t"
 | 
			
		||||
        "je             3f                       \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"vxorpd		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorpd		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
	"vmovupd	(%3,%0,8), %%ymm8	       \n\t"	// 4 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $8 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L8END%=:                                    \n\t"
 | 
			
		||||
        "3:                                    \n\t"
 | 
			
		||||
	"vzeroupper			              \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"xorpd           %%xmm4 , %%xmm4	 \n\t"
 | 
			
		||||
	"xorpd           %%xmm5 , %%xmm5	 \n\t"
 | 
			
		||||
	"movups             (%3,%0,8), %%xmm7          \n\t" // 2 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
 | 
			
		||||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"shufpd $0,  %%xmm6 , %%xmm6 \n\t"	
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"xorpd           %%xmm4 , %%xmm4	 \n\t"
 | 
			
		||||
	"xorpd           %%xmm5 , %%xmm5	 \n\t"
 | 
			
		||||
	"movups	       (%3,%0,8), %%xmm7	 \n\t"	// 2 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 | 
			
		|||
	"xorpd %%xmm11 , %%xmm11		\n\t"
 | 
			
		||||
		
 | 
			
		||||
	"testq	$2 , %1				\n\t"
 | 
			
		||||
	"jz	.L01LABEL%=			\n\t"
 | 
			
		||||
	"jz	2f			\n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%5,%0,8) , %%xmm14		\n\t" // x
 | 
			
		||||
	"movups  (%3,%0,8) , %%xmm12		\n\t" // ap0
 | 
			
		||||
| 
						 | 
				
			
			@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 | 
			
		|||
        "subq           $2 , %1                 \n\t"
 | 
			
		||||
	"addpd   %%xmm13   , %%xmm11		\n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01LABEL%=:                           \n\t"
 | 
			
		||||
        "2:                           \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq	$0, %1				\n\t"
 | 
			
		||||
	"je	.L01END%=			\n\t"
 | 
			
		||||
	"je	3f			\n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%5,%0,8) , %%xmm14		\n\t" // x
 | 
			
		||||
	"movups  (%3,%0,8) , %%xmm12		\n\t" // ap0
 | 
			
		||||
| 
						 | 
				
			
			@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq           $4 , %0                 \n\t"
 | 
			
		||||
        "subq           $4 , %1                 \n\t"
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01END%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
 | 
			
		||||
	"haddpd        %%xmm10, %%xmm10         \n\t"
 | 
			
		||||
	"haddpd        %%xmm11, %%xmm11         \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"xorpd %%xmm10 , %%xmm10		\n\t"
 | 
			
		||||
	
 | 
			
		||||
	"testq	$2 , %1				\n\t"
 | 
			
		||||
	"jz	.L01LABEL%=			\n\t"
 | 
			
		||||
	"jz	2f			\n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%3,%0,8) , %%xmm12		\n\t"
 | 
			
		||||
	"movups  (%4,%0,8) , %%xmm11		\n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"addpd   %%xmm12   , %%xmm10		\n\t"
 | 
			
		||||
        "subq           $2 , %1                 \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01LABEL%=:                           \n\t"
 | 
			
		||||
        "2:                           \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq	$0, %1				\n\t"
 | 
			
		||||
	"je	.L01END%=			\n\t"
 | 
			
		||||
	"je	3f			\n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups    (%3,%0,8) , %%xmm12		\n\t"
 | 
			
		||||
	"movups  16(%3,%0,8) , %%xmm14		\n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "subq           $4 , %1                 \n\t"
 | 
			
		||||
	"addpd   %%xmm14   , %%xmm9 		\n\t"
 | 
			
		||||
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01END%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
 | 
			
		||||
	"addpd	       %%xmm9 , %%xmm10         \n\t"
 | 
			
		||||
	"haddpd        %%xmm10, %%xmm10         \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 | 
			
		|||
	"shufpd  $0 , %%xmm10 , %%xmm10		\n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%3,%0,8) , %%xmm12		\n\t"
 | 
			
		||||
	"movups  (%4,%0,8) , %%xmm11		\n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 | 
			
		|||
        "subq           $2 , %1                 \n\t"
 | 
			
		||||
	"movups  %%xmm11, -16(%4,%0,8)		\n\t"
 | 
			
		||||
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        :
 | 
			
		||||
   	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vxorpd		%%ymm7 , %%ymm7, %%ymm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%2,%0,8), %%ymm12       \n\t"	// 4 * x
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "addq		$4 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $4 , %1			      \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             3f                      \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	// "prefetcht0	 384(%2,%0,8)		 \n\t"
 | 
			
		||||
	"vmovups	(%2,%0,8), %%ymm12       \n\t"	// 4 * x
 | 
			
		||||
	"vmovups      32(%2,%0,8), %%ymm13       \n\t"	// 4 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"subq	        $8 , %1			       \n\t"		
 | 
			
		||||
	"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7     \n\t" 
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:                                   \n\t"
 | 
			
		||||
        "3:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
	"vextractf128   $1 , %%ymm4, %%xmm12	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm5, %%xmm13	      \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 | 
			
		|||
	"vmovddup 24(%8),    %%xmm7	             \n\t"	// temp1[1]
 | 
			
		||||
 | 
			
		||||
	".align 16				     \n\t"
 | 
			
		||||
	".L01LOOP%=:				     \n\t"
 | 
			
		||||
	"1:				     \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,8), %%xmm12	           \n\t"  // 2 * a
 | 
			
		||||
	"vmovups	(%2,%0,8), %%xmm8	           \n\t"  // 2 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 | 
			
		|||
	"vmovups	%%xmm11 , -16(%3,%0,8)		   \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		%0 , %1			      \n\t"
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovsd		  (%9), %%xmm4		      \n\t"
 | 
			
		||||
	"vmovsd		 8(%9), %%xmm5		      \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 | 
			
		|||
	"shufpd $0,  %%xmm7, %%xmm7                  \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16		  		       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
	"movups	            (%4,%0,8), %%xmm12	       \n\t"	// 2 * a
 | 
			
		||||
	"movups	            (%2,%0,8), %%xmm8	       \n\t"	// 2 * x
 | 
			
		||||
	"movups		    %%xmm12  , %%xmm11	       \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 | 
			
		|||
	"movups             %%xmm9,-16(%3,%0,8)       \n\t"    // 2 * y
 | 
			
		||||
 | 
			
		||||
	"cmpq	        %0 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"movsd	       (%9),    %%xmm4	             \n\t"	// temp1[0]
 | 
			
		||||
	"movsd        8(%9),    %%xmm5	             \n\t"	// temp1[1]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
	"xorq		%0,%0			     \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				     \n\t"
 | 
			
		||||
	".L01LOOP%=:				     \n\t"
 | 
			
		||||
	"1:				     \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,8), %%xmm12	           \n\t"  // 2 * a
 | 
			
		||||
	"vmovups	(%2,%0,8), %%xmm8	           \n\t"  // 2 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
	"vmovups	%%xmm9 ,  -32(%3,%0,8)		   \n\t"
 | 
			
		||||
	"vmovups	%%xmm11 , -16(%3,%0,8)		   \n\t"
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vhaddpd        %%xmm0, %%xmm0, %%xmm0  \n\t"
 | 
			
		||||
	"vhaddpd        %%xmm1, %%xmm1, %%xmm1  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
	"xorq		%0,%0			     \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16		  		       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
	"movups	            (%4,%0,8), %%xmm12	       \n\t"	// 2 * a
 | 
			
		||||
	"movups	            (%2,%0,8), %%xmm8	       \n\t"	// 2 * x
 | 
			
		||||
	"movups		    %%xmm12  , %%xmm11	       \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
	"movups             %%xmm9,-16(%3,%0,8)       \n\t"    // 2 * y
 | 
			
		||||
 | 
			
		||||
	"subq	        $2 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"haddpd        %%xmm0, %%xmm0  \n\t"
 | 
			
		||||
	"haddpd        %%xmm1, %%xmm1  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
	"shufps          $0,  %%xmm0, %%xmm0                \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
        // "prefetcht0      192(%2,%0,4)                       \n\t"
 | 
			
		||||
        // "prefetcht0      192(%3,%0,4)                       \n\t"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
 | 
			
		||||
	"addq		$16, %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $16, %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
	"vxorps		%%xmm7, %%xmm7, %%xmm7	             \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				             \n\t"
 | 
			
		||||
	".L01LOOP%=:				             \n\t"
 | 
			
		||||
	"1:				             \n\t"
 | 
			
		||||
        "vmovups                  (%2,%0,4), %%xmm12         \n\t"  // 4 * x
 | 
			
		||||
        "vmovups                16(%2,%0,4), %%xmm13         \n\t"  // 4 * x
 | 
			
		||||
        "vmovups                32(%2,%0,4), %%xmm14         \n\t"  // 4 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
 | 
			
		||||
	"addq		$16, %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $16, %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	"vaddps        %%xmm4, %%xmm5, %%xmm4	\n\t"
 | 
			
		||||
	"vaddps        %%xmm6, %%xmm7, %%xmm6	\n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
	"xorps		%%xmm7, %%xmm7	             \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
        "movups                  (%2,%0,4), %%xmm12         \n\t"  // 4 * x
 | 
			
		||||
        "movups                  (%3,%0,4), %%xmm8          \n\t"  // 4 * x
 | 
			
		||||
        "movups                16(%2,%0,4), %%xmm13         \n\t"  // 4 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 | 
			
		|||
 | 
			
		||||
	"addq		$16, %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $16, %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	"addps        %%xmm5, %%xmm4	\n\t"
 | 
			
		||||
	"addps        %%xmm7, %%xmm6	\n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"shufps $0,  %%xmm13, %%xmm13    \n\t"	
 | 
			
		||||
 | 
			
		||||
	".align 16				       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
	"movups	       (%3,%0,4), %%xmm4	       \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	"movups             (%4,%0,4), %%xmm8          \n\t" 
 | 
			
		||||
| 
						 | 
				
			
			@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"movups  %%xmm4 , -16(%3,%0,4)		       \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifndef HAVE_KERNEL_4x2
 | 
			
		||||
#ifndef HAVE_KERNEL_4x1
 | 
			
		||||
 | 
			
		||||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 | 
			
		|||
        "shufps $0,  %%xmm12, %%xmm12            \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                   \n\t"
 | 
			
		||||
        "je             .L16END%=                \n\t"
 | 
			
		||||
        "je             2f                \n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                               \n\t"
 | 
			
		||||
        ".L01LOOP%=:                             \n\t"
 | 
			
		||||
        "1:                             \n\t"
 | 
			
		||||
        "movups       (%3,%0,4), %%xmm4          \n\t"  // 4 * y
 | 
			
		||||
        "movups     16(%3,%0,4), %%xmm5          \n\t"  // 4 * y
 | 
			
		||||
        "movups       (%4,%0,4), %%xmm8          \n\t"  // 4 * a
 | 
			
		||||
| 
						 | 
				
			
			@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 | 
			
		|||
 | 
			
		||||
        "subq           $8 , %1                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "jnz            .L01LOOP%=               \n\t"
 | 
			
		||||
        "jnz            1b               \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:                              \n\t"
 | 
			
		||||
        "2:                              \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %5                \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=              \n\t"
 | 
			
		||||
        "jz             3f              \n\t"
 | 
			
		||||
 | 
			
		||||
        "movups       (%3,%0,4), %%xmm4          \n\t"  // 4 * y
 | 
			
		||||
        "movups       (%4,%0,4), %%xmm8          \n\t"  // 4 * a
 | 
			
		||||
| 
						 | 
				
			
			@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 | 
			
		|||
        "addq           $4 , %0                  \n\t"
 | 
			
		||||
        "subq           $4 , %1                  \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:      			 \n\t" 
 | 
			
		||||
        "3:      			 \n\t" 
 | 
			
		||||
        :
 | 
			
		||||
        :
 | 
			
		||||
          "r" (i),      // 0    
 | 
			
		||||
| 
						 | 
				
			
			@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 | 
			
		|||
        (
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
        "movups  (%2,%0,4) , %%xmm12            \n\t"
 | 
			
		||||
        "movups  (%3,%0,4) , %%xmm11            \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 | 
			
		|||
        "movups  %%xmm11, -16(%3,%0,4)          \n\t"
 | 
			
		||||
 | 
			
		||||
        "subq           $4 , %1                 \n\t"
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        :
 | 
			
		||||
        :
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"vbroadcastss    (%9), %%xmm8 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps		%%xmm4, %%xmm4 , %%xmm4  \n\t"
 | 
			
		||||
	"vxorps		%%xmm5, %%xmm5 , %%xmm5  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
	"vmovups  %%xmm6, -16(%3,%0,4)		       \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	".L08LABEL%=:                                  \n\t"
 | 
			
		||||
	"2:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                      \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=                    \n\t"
 | 
			
		||||
        "jz             3f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps		%%xmm4, %%xmm4 , %%xmm4  \n\t"
 | 
			
		||||
	"vxorps		%%xmm5, %%xmm5 , %%xmm5  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"subq	        $8 , %1			      \n\t"		
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                                  \n\t"
 | 
			
		||||
        "3:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             4f                      \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps		%%xmm4, %%xmm4 , %%xmm4  \n\t"
 | 
			
		||||
	"vxorps		%%xmm5, %%xmm5 , %%xmm5  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"vmovups  %%xmm7,-16(%3,%0,4)		      \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	".L16END%=:                             \n\t"
 | 
			
		||||
	"4:                             \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vbroadcastss    (%8), %%xmm8 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"vxorps		%%xmm4, %%xmm4 , %%xmm4  \n\t"
 | 
			
		||||
	"vxorps		%%xmm5, %%xmm5 , %%xmm5  \n\t"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$4 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $4 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      	\n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -50,7 +50,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"vbroadcastss    (%9), %%ymm6 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%3,%0,4), %%xmm7	       \n\t"	// 4 * y
 | 
			
		||||
	"vxorps		%%xmm4 , %%xmm4, %%xmm4        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                      \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=                    \n\t"
 | 
			
		||||
        "jz             3f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%3,%0,4), %%ymm7	       \n\t"	// 8 * y
 | 
			
		||||
	"vxorps		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
        "addq		$8 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $8 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                                  \n\t"
 | 
			
		||||
        "3:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             4f                      \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorps		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"vmovups  %%ymm9,-32(%3,%0,4)		      \n\t"	// 8 * y
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:                             \n\t"
 | 
			
		||||
        "4:                             \n\t"
 | 
			
		||||
	"vzeroupper			        \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vbroadcastss    (%8), %%ymm6 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorps		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                      \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=                    \n\t"
 | 
			
		||||
        "jz             3f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorps		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
        "addq		$8 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $8 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                                  \n\t"
 | 
			
		||||
        "3:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             4f                      \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"vxorps		%%ymm4 , %%ymm4, %%ymm4        \n\t"
 | 
			
		||||
	"vxorps		%%ymm5 , %%ymm5, %%ymm5        \n\t"
 | 
			
		||||
	"vmovups	(%3,%0,4), %%ymm8	 \n\t"	// 8 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$16, %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:                             \n\t"
 | 
			
		||||
        "4:                             \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"xorps           %%xmm4 , %%xmm4	 \n\t"
 | 
			
		||||
	"xorps           %%xmm5 , %%xmm5	 \n\t"
 | 
			
		||||
	"movups             (%3,%0,4), %%xmm7          \n\t" // 4 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
 | 
			
		||||
	"movups  %%xmm7 , -16(%3,%0,4)		       \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"shufps $0,  %%xmm6 , %%xmm6 \n\t"	
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"xorps           %%xmm4 , %%xmm4	 \n\t"
 | 
			
		||||
	"movups	       (%3,%0,4), %%xmm7	 \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"addps		%%xmm7 , %%xmm11 	       \n\t"
 | 
			
		||||
	"movups  %%xmm11, -16(%3,%0,4)		       \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"vbroadcastss    (%9), %%ymm6 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1               \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=             \n\t"
 | 
			
		||||
        "jz             2f             \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps	  %%xmm4 , %%xmm4 , %%xmm4        \n\t"
 | 
			
		||||
	"vxorps	  %%xmm5 , %%xmm5 , %%xmm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
        "addq		$4, %0	  	 	  \n\t"
 | 
			
		||||
	"subq	        $4, %1			  \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                             \n\t"
 | 
			
		||||
        "2:                             \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                 \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=               \n\t"
 | 
			
		||||
        "jz             3f               \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps	  %%ymm4 , %%ymm4 , %%ymm4        \n\t"
 | 
			
		||||
	"vxorps	  %%ymm5 , %%ymm5 , %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
	"subq	        $8, %1			  \n\t"		
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                    \n\t"
 | 
			
		||||
        "je             .L16END%=                 \n\t"
 | 
			
		||||
        "je             4f                 \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"vxorps	  %%ymm4 , %%ymm4 , %%ymm4        \n\t"
 | 
			
		||||
	"vxorps	  %%ymm5 , %%ymm5 , %%ymm5        \n\t"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 | 
			
		|||
        "addq		$16, %8	  	 	  \n\t"
 | 
			
		||||
        "addq		$16, %0	  	 	  \n\t"
 | 
			
		||||
	"subq	        $16, %1			  \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		  \n\t"
 | 
			
		||||
	"jnz		1b		  \n\t"
 | 
			
		||||
 | 
			
		||||
	".L16END%=:                               \n\t"
 | 
			
		||||
	"4:                               \n\t"
 | 
			
		||||
	"vzeroupper			          \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vbroadcastss    (%8), %%ymm6 	 \n\t"	// alpha 
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1               \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=             \n\t"
 | 
			
		||||
        "jz             2f             \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps	  %%ymm4 , %%ymm4 , %%ymm4        \n\t"
 | 
			
		||||
	"vxorps	  %%ymm5 , %%ymm5 , %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
        "addq		$4, %0	  	 	  \n\t"
 | 
			
		||||
	"subq	        $4, %1			  \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                           \n\t"
 | 
			
		||||
        "2:                           \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                 \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=               \n\t"
 | 
			
		||||
        "jz             3f               \n\t"
 | 
			
		||||
 | 
			
		||||
	"vxorps	  %%ymm4 , %%ymm4 , %%ymm4        \n\t"
 | 
			
		||||
	"vxorps	  %%ymm5 , %%ymm5 , %%ymm5        \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"subq	        $8, %1			  \n\t"		
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                    \n\t"
 | 
			
		||||
        "je             .L16END%=                 \n\t"
 | 
			
		||||
        "je             4f                 \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"vxorps	  %%ymm4 , %%ymm4 , %%ymm4        \n\t"
 | 
			
		||||
	"vxorps	  %%ymm5 , %%ymm5 , %%ymm5        \n\t"
 | 
			
		||||
	"vmovups	(%3,%0,4), %%ymm0	 \n\t"	// 8 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$16, %0	  	 	  \n\t"
 | 
			
		||||
	"subq	        $16, %1			  \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		  \n\t"
 | 
			
		||||
	"jnz		1b		  \n\t"
 | 
			
		||||
 | 
			
		||||
	".L16END%=:                               \n\t"
 | 
			
		||||
	"4:                               \n\t"
 | 
			
		||||
	"vzeroupper			          \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 | 
			
		|||
	"xorps %%xmm11 , %%xmm11		\n\t"
 | 
			
		||||
		
 | 
			
		||||
	"testq	$4 , %1				\n\t"
 | 
			
		||||
	"jz	.L01LABEL%=			\n\t"
 | 
			
		||||
	"jz	2f			\n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%5,%0,4) , %%xmm14		\n\t" // x
 | 
			
		||||
	"movups  (%3,%0,4) , %%xmm12		\n\t" // ap0
 | 
			
		||||
| 
						 | 
				
			
			@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 | 
			
		|||
        "subq           $4 , %1                 \n\t"
 | 
			
		||||
	"addps   %%xmm13   , %%xmm11		\n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01LABEL%=:                           \n\t"
 | 
			
		||||
        "2:                           \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq	$0, %1				\n\t"
 | 
			
		||||
	"je	.L01END%=			\n\t"
 | 
			
		||||
	"je	3f			\n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%5,%0,4) , %%xmm14		\n\t" // x
 | 
			
		||||
	"movups  (%3,%0,4) , %%xmm12		\n\t" // ap0
 | 
			
		||||
| 
						 | 
				
			
			@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq           $8 , %0                 \n\t"
 | 
			
		||||
        "subq           $8 , %1                 \n\t"
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01END%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
 | 
			
		||||
	"haddps        %%xmm10, %%xmm10         \n\t"
 | 
			
		||||
	"haddps        %%xmm11, %%xmm11         \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"xorps %%xmm10 , %%xmm10		\n\t"
 | 
			
		||||
	
 | 
			
		||||
	"testq	$4 , %1				\n\t"
 | 
			
		||||
	"jz	.L01LABEL%=			\n\t"
 | 
			
		||||
	"jz	2f			\n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%3,%0,4) , %%xmm12		\n\t"
 | 
			
		||||
	"movups  (%4,%0,4) , %%xmm11		\n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"addps   %%xmm12   , %%xmm10		\n\t"
 | 
			
		||||
        "subq           $4 , %1                 \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01LABEL%=:                           \n\t"
 | 
			
		||||
        "2:                           \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq	$0, %1				\n\t"
 | 
			
		||||
	"je	.L01END%=			\n\t"
 | 
			
		||||
	"je	3f			\n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups    (%3,%0,4) , %%xmm12		\n\t"
 | 
			
		||||
	"movups  16(%3,%0,4) , %%xmm14		\n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "subq           $8 , %1                 \n\t"
 | 
			
		||||
	"addps   %%xmm14   , %%xmm9 		\n\t"
 | 
			
		||||
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L01END%=:                             \n\t"
 | 
			
		||||
        "3:                             \n\t"
 | 
			
		||||
 | 
			
		||||
	"addps	       %%xmm9 , %%xmm10         \n\t"
 | 
			
		||||
	"haddps        %%xmm10, %%xmm10         \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 | 
			
		|||
	"shufps  $0 , %%xmm10 , %%xmm10		\n\t"
 | 
			
		||||
 | 
			
		||||
        ".align 16                              \n\t"
 | 
			
		||||
        ".L01LOOP%=:                            \n\t"
 | 
			
		||||
        "1:                            \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups  (%3,%0,4) , %%xmm12		\n\t"
 | 
			
		||||
	"movups  (%4,%0,4) , %%xmm11		\n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 | 
			
		|||
        "subq           $4 , %1                 \n\t"
 | 
			
		||||
	"movups  %%xmm11, -16(%4,%0,4)		\n\t"
 | 
			
		||||
 | 
			
		||||
        "jnz            .L01LOOP%=              \n\t"
 | 
			
		||||
        "jnz            1b              \n\t"
 | 
			
		||||
 | 
			
		||||
        :
 | 
			
		||||
   	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vxorps		%%xmm7, %%xmm7, %%xmm7	 \n\t"
 | 
			
		||||
 | 
			
		||||
	"testq		$0x04, %1		       \n\t"
 | 
			
		||||
	"jz		.L08LABEL%=		       \n\t"
 | 
			
		||||
	"jz		2f		       \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovups        (%2,%0,4), %%xmm12             \n\t"  // 4 * x
 | 
			
		||||
	"vfmaddps %%xmm4,   (%4,%0,4), %%xmm12, %%xmm4 \n\t" 
 | 
			
		||||
| 
						 | 
				
			
			@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "addq		$4 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $4 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
	".L08LABEL%=:				       \n\t"
 | 
			
		||||
	"2:				       \n\t"
 | 
			
		||||
 | 
			
		||||
	"testq		$0x08, %1		       \n\t"
 | 
			
		||||
	"jz		.L16LABEL%=		       \n\t"
 | 
			
		||||
	"jz		3f		       \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovups        (%2,%0,4), %%xmm12             \n\t"  // 4 * x
 | 
			
		||||
        "vmovups      16(%2,%0,4), %%xmm13             \n\t"  // 4 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "addq		$8 , %0	  	 	       \n\t"
 | 
			
		||||
	"subq	        $8 , %1			       \n\t"		
 | 
			
		||||
 | 
			
		||||
	".L16LABEL%=:				       \n\t"
 | 
			
		||||
	"3:				       \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		$0, %1		               \n\t"
 | 
			
		||||
	"je		.L16END%=		       \n\t"
 | 
			
		||||
	"je		4f		       \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
        "vmovups        (%2,%0,4), %%xmm12             \n\t"  // 4 * x
 | 
			
		||||
 | 
			
		||||
	"prefetcht0	 384(%4,%0,4)		       \n\t"
 | 
			
		||||
| 
						 | 
				
			
			@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"subq	        $16, %1			       \n\t"		
 | 
			
		||||
	"vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" 
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
	".L16END%=:				\n\t"
 | 
			
		||||
	"4:				\n\t"
 | 
			
		||||
	"vhaddps        %%xmm4, %%xmm4, %%xmm4	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm5, %%xmm5, %%xmm5	\n\t"
 | 
			
		||||
	"vhaddps        %%xmm6, %%xmm6, %%xmm6	\n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vxorps		%%ymm7 , %%ymm7, %%ymm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%2,%0,4), %%xmm12             \n\t"	// 4 * x
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "addq		$4 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $4 , %1			      \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                      \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=                    \n\t"
 | 
			
		||||
        "jz             3f                    \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%2,%0,4), %%ymm12             \n\t"	// 8 * x
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "addq		$8 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $8 , %1			      \n\t"		
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                                  \n\t"
 | 
			
		||||
        "3:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             4f                      \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"prefetcht0	 384(%2,%0,4)		 \n\t"
 | 
			
		||||
	"vmovups	(%2,%0,4), %%ymm12       \n\t"	// 8 * x
 | 
			
		||||
	"vmovups      32(%2,%0,4), %%ymm13       \n\t"	// 8 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$16, %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:                                   \n\t"
 | 
			
		||||
        "4:                                   \n\t"
 | 
			
		||||
 | 
			
		||||
	"vextractf128   $1 , %%ymm4, %%xmm12	      \n\t"
 | 
			
		||||
	"vextractf128   $1 , %%ymm5, %%xmm13	      \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"xorps		%%xmm7 , %%xmm7	         \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
 | 
			
		||||
	"movups	       (%2,%0,4), %%xmm12              \n\t"   // 4 * x
 | 
			
		||||
	"movups        (%4,%0,4), %%xmm8               \n\t"   // 4 * a0
 | 
			
		||||
| 
						 | 
				
			
			@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"addps		%%xmm10, %%xmm6		       \n\t"
 | 
			
		||||
	"addps		%%xmm11, %%xmm7		       \n\t"
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		       \n\t"
 | 
			
		||||
	"jnz		1b		       \n\t"
 | 
			
		||||
 | 
			
		||||
        "haddps        %%xmm4, %%xmm4  \n\t"
 | 
			
		||||
        "haddps        %%xmm5, %%xmm5  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
        "vxorps         %%ymm7 , %%ymm7, %%ymm7  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x04, %1                      \n\t"
 | 
			
		||||
        "jz             .L08LABEL%=                    \n\t"
 | 
			
		||||
        "jz             2f                    \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovups        (%2,%0,4), %%xmm12       \n\t"  // 4 * x
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"subq	        $4 , %1			      \n\t"		
 | 
			
		||||
	"vaddps	  %%xmm7, %%xmm11, %%xmm7	  \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L08LABEL%=:                                  \n\t"
 | 
			
		||||
        "2:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "testq          $0x08, %1                      \n\t"
 | 
			
		||||
        "jz             .L16LABEL%=                    \n\t"
 | 
			
		||||
        "jz             3f                    \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovups        (%2,%0,4), %%ymm12       \n\t"  // 8 * x
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"subq	        $8 , %1			      \n\t"		
 | 
			
		||||
	"vaddps	  %%ymm7, %%ymm11, %%ymm7	  \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16LABEL%=:                                  \n\t"
 | 
			
		||||
        "3:                                  \n\t"
 | 
			
		||||
 | 
			
		||||
        "cmpq           $0, %1                         \n\t"
 | 
			
		||||
        "je             .L16END%=                      \n\t"
 | 
			
		||||
        "je             4f                      \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	".align 16				 \n\t"
 | 
			
		||||
	".L01LOOP%=:				 \n\t"
 | 
			
		||||
	"1:				 \n\t"
 | 
			
		||||
	"prefetcht0	 384(%2,%0,4)		       \n\t"
 | 
			
		||||
        "vmovups        (%2,%0,4), %%ymm12       \n\t"  // 8 * x
 | 
			
		||||
        "vmovups      32(%2,%0,4), %%ymm13       \n\t"  // 8 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"subq	        $16, %1			      \n\t"		
 | 
			
		||||
	"vaddps	  %%ymm3, %%ymm11, %%ymm3	  \n\t"
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
        ".L16END%=:				      \n\t"
 | 
			
		||||
        "4:				      \n\t"
 | 
			
		||||
 | 
			
		||||
        "vaddps         %%ymm4, %%ymm0, %%ymm4       \n\t"
 | 
			
		||||
        "vaddps         %%ymm5, %%ymm1, %%ymm5       \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 | 
			
		|||
	"vbroadcastss 12(%8),    %%xmm7	             \n\t"	// temp1[3]
 | 
			
		||||
 | 
			
		||||
	".align 16				     \n\t"
 | 
			
		||||
	".L01LOOP%=:				     \n\t"
 | 
			
		||||
	"1:				     \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%4,%0,4), %%xmm12	           \n\t"  // 2 * a
 | 
			
		||||
	"vmovups	(%2,%0,4), %%xmm8	           \n\t"  // 2 * x
 | 
			
		||||
| 
						 | 
				
			
			@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 | 
			
		|||
	"vmovups	%%xmm9 ,  -16(%3,%0,4)		   \n\t"
 | 
			
		||||
 | 
			
		||||
	"cmpq		%0 , %1			      \n\t"
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovss		  (%9), %%xmm4		      \n\t"
 | 
			
		||||
	"vmovss		 4(%9), %%xmm5		      \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to,  FLOAT **a, FLOAT *x, F
 | 
			
		|||
	"shufps $0,  %%xmm7, %%xmm7                  \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16		  		       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
	"movups	            (%2,%0,4), %%xmm8	       \n\t"	// 4 * x
 | 
			
		||||
	"movups	            (%3,%0,4), %%xmm9         \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to,  FLOAT **a, FLOAT *x, F
 | 
			
		|||
 | 
			
		||||
        "addq		$4 , %0	  	 	      \n\t"
 | 
			
		||||
	"cmpq	        %0 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"movss	       (%9),    %%xmm4	             \n\t"	// temp1[0]
 | 
			
		||||
	"movss        4(%9),    %%xmm5	             \n\t"	// temp1[1]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
	"xorq		%0,%0			     \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				     \n\t"
 | 
			
		||||
	".L01LOOP%=:				     \n\t"
 | 
			
		||||
	"1:				     \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovups	(%2,%0,4), %%xmm8	           \n\t"  // 4 * x
 | 
			
		||||
	"vmovups	(%3,%0,4), %%xmm9	           \n\t"  // 4 * y
 | 
			
		||||
| 
						 | 
				
			
			@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
 | 
			
		||||
	"addq		$4 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $4 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"vhaddps        %%xmm0, %%xmm0, %%xmm0  \n\t"
 | 
			
		||||
	"vhaddps        %%xmm1, %%xmm1, %%xmm1  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
	"xorq		%0,%0			     \n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16		  		       \n\t"
 | 
			
		||||
	".L01LOOP%=:				       \n\t"
 | 
			
		||||
	"1:				       \n\t"
 | 
			
		||||
	"movups	            (%2,%0,4), %%xmm8	       \n\t"	// 4 * x
 | 
			
		||||
	"movups	            (%3,%0,4), %%xmm9         \n\t"	// 4 * y
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$4 , %0	  	 	      \n\t"
 | 
			
		||||
	"subq	        $4 , %1			      \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		      \n\t"
 | 
			
		||||
	"jnz		1b		      \n\t"
 | 
			
		||||
 | 
			
		||||
	"haddps        %%xmm0, %%xmm0  \n\t"
 | 
			
		||||
	"haddps        %%xmm1, %%xmm1  \n\t"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
	"vmovddup	       8(%4), %%xmm1		    \n\t"  // imag part of alpha
 | 
			
		||||
 | 
			
		||||
	".align 16				            \n\t"
 | 
			
		||||
	".L01LOOP%=:				            \n\t"
 | 
			
		||||
	"1:				            \n\t"
 | 
			
		||||
 | 
			
		||||
	"prefetcht0  768(%2,%0,8)                           \n\t"
 | 
			
		||||
	"vmovups        (%2,%0,8), %%xmm5                   \n\t" // 1 complex values from x
 | 
			
		||||
| 
						 | 
				
			
			@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 | 
			
		|||
 | 
			
		||||
	"addq		$8 , %0	  	 	             \n\t"
 | 
			
		||||
	"subq	        $4 , %1			             \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		             \n\t"
 | 
			
		||||
	"jnz		1b		             \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
        : 
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      192(%4,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      192(%4,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vbroadcastsd	 8(%2), %%ymm1                  \n\t"  // imag part x0
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      192(%4,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
	"vmovups      32(%4,%0,8), %%ymm9	        \n\t" // 2 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
	"vbroadcastsd	  (%5), %%ymm1                  \n\t"  // alpha_i
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	"prefetcht0      192(%2,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%2,%0,8), %%ymm8	        \n\t" // 2 complex values from src
 | 
			
		||||
	"vmovups      32(%2,%0,8), %%ymm9	        \n\t" 
 | 
			
		||||
| 
						 | 
				
			
			@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        //"prefetcht0      256(%4,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vbroadcastsd	24(%2), %%ymm3                  \n\t"  // imag part x1
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        // "prefetcht0      256(%4,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
	"vbroadcastsd	 8(%2), %%ymm1                  \n\t"  // imag part x0
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        // "prefetcht0      256(%4,%0,8)                   \n\t"
 | 
			
		||||
	"vmovups	(%4,%0,8), %%ymm8	        \n\t" // 2 complex values form a0
 | 
			
		||||
| 
						 | 
				
			
			@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
	"vbroadcastsd	  (%5), %%ymm1                  \n\t"  // alpha_i
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
	// "prefetcht0      192(%2,%0,8)			\n\t"
 | 
			
		||||
	"vmovups	(%2,%0,8), %%ymm8	        \n\t" // 2 complex values from src
 | 
			
		||||
	"vmovups      32(%2,%0,8), %%ymm9	        \n\t" 
 | 
			
		||||
| 
						 | 
				
			
			@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
	"vzeroupper			 \n\t"
 | 
			
		||||
 | 
			
		||||
	:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vxorpd		%%xmm15, %%xmm15, %%xmm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	  8(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
| 
						 | 
				
			
			@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup               (%8)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
	"vmovddup	       8(%8)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vxorpd		%%xmm11, %%xmm11, %%xmm11	\n\t" // temp
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	  8(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
| 
						 | 
				
			
			@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup               (%6)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
	"vmovddup	       8(%6)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
	"vxorpd		%%xmm9 , %%xmm9 , %%xmm9 	\n\t" // temp
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
	"vmovddup	  8(%2,%0,8), %%xmm1            \n\t"  // imag value from x0
 | 
			
		||||
| 
						 | 
				
			
			@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
	"vfmaddpd   %%xmm8 ,   %%xmm5 , %%xmm2, %%xmm8       \n\t" // ar0*xr0,al0*xr0 
 | 
			
		||||
	"vfmaddpd   %%xmm9 ,   %%xmm5 , %%xmm3, %%xmm9       \n\t" // ar0*xl0,al0*xl0 
 | 
			
		||||
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
	"vmovddup               (%5)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
	"vmovddup	       8(%5)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vxorpd		%%ymm15, %%ymm15, %%ymm15	\n\t"
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        "prefetcht0      192(%2,%0,8)                   \n\t"
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
| 
						 | 
				
			
			@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovddup               (%8)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
        "vmovddup              8(%8)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
	"vxorpd		%%ymm11, %%ymm11, %%ymm11	\n\t" // temp
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        "prefetcht0      192(%2,%0,8)                   \n\t"
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
| 
						 | 
				
			
			@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovddup               (%6)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
        "vmovddup              8(%6)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
	"vxorpd		%%ymm9 , %%ymm9 , %%ymm9 	\n\t" // temp
 | 
			
		||||
 | 
			
		||||
	".align 16				        \n\t"
 | 
			
		||||
	".L01LOOP%=:				        \n\t"
 | 
			
		||||
	"1:				        \n\t"
 | 
			
		||||
 | 
			
		||||
        "prefetcht0      192(%2,%0,8)                   \n\t"
 | 
			
		||||
	"vmovddup	   (%2,%0,8), %%xmm0            \n\t"  // real value from x0
 | 
			
		||||
| 
						 | 
				
			
			@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 | 
			
		|||
 | 
			
		||||
        "addq		$8 , %0	  	 	        \n\t"
 | 
			
		||||
	"subq	        $4 , %1			        \n\t"		
 | 
			
		||||
	"jnz		.L01LOOP%=		        \n\t"
 | 
			
		||||
	"jnz		1b		        \n\t"
 | 
			
		||||
 | 
			
		||||
        "vmovddup               (%5)  , %%xmm0                \n\t"  // value from alpha
 | 
			
		||||
        "vmovddup              8(%5)  , %%xmm1                \n\t"  // value from alpha
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue