Merge pull request #473 from wernsaar/develop

changed inline assembler labels to short form
This commit is contained in:
Zhang Xianyi 2014-12-08 13:22:18 +08:00
commit 8fe7a9ce6f
38 changed files with 241 additions and 241 deletions

View File

@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 768(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %8 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %6 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %5 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %6 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:

View File

@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L08END%=: \n\t"
"3: \n\t"
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L08END%=: \n\t"
"3: \n\t"
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L08END%=: \n\t"
"3: \n\t"
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha

View File

@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmovddup (%4), %%xmm0 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 768(%3,%0,8) \n\t"
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"shufpd $0, %%xmm0, %%xmm0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 192(%2,%0,8) \n\t"
// "prefetcht0 192(%3,%0,8) \n\t"
@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"

View File

@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xorpd %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"addpd %%xmm5, %%xmm4 \n\t"
"addpd %%xmm7, %%xmm6 \n\t"

View File

@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufpd $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"shufpd $0, %%xmm12, %%xmm12 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"jz 2f \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"jz 2f \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L8END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L8END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:

View File

@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"xorpd %%xmm11 , %%xmm11 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"jz 2f \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"subq $2 , %1 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"3: \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
"haddpd %%xmm11, %%xmm11 \n\t"
@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"xorpd %%xmm10 , %%xmm10 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"jz 2f \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $2 , %1 \n\t"
".L01LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups 16(%3,%0,8) , %%xmm14 \n\t"
@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"subq $4 , %1 \n\t"
"addpd %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"3: \n\t"
"addpd %%xmm9 , %%xmm10 \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"subq $2 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 384(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $8 , %1 \n\t"
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"3: \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"

View File

@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovsd (%9), %%xmm4 \n\t"
"vmovsd 8(%9), %%xmm5 \n\t"

View File

@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"shufpd $0, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"movups %%xmm12 , %%xmm11 \n\t"
@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"movsd (%9), %%xmm4 \n\t" // temp1[0]
"movsd 8(%9), %%xmm5 \n\t" // temp1[1]

View File

@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"

View File

@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"movups %%xmm12 , %%xmm11 \n\t"
@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
"subq $2 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"haddpd %%xmm0, %%xmm0 \n\t"
"haddpd %%xmm1, %%xmm1 \n\t"

View File

@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"shufps $0, %%xmm0, %%xmm0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 192(%2,%0,4) \n\t"
// "prefetcht0 192(%3,%0,4) \n\t"
@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"

View File

@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xorps %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"addps %%xmm5, %%xmm4 \n\t"
"addps %%xmm7, %%xmm6 \n\t"

View File

@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufps $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t"
@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
#endif
#ifndef HAVE_KERNEL_4x2
#ifndef HAVE_KERNEL_4x1
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"shufps $0, %%xmm12, %%xmm12 \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"2: \n\t"
"testq $0x04, %5 \n\t"
"jz .L08LABEL%= \n\t"
"jz 3f \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"3: \n\t"
:
:
"r" (i), // 0
@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
(
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4) , %%xmm12 \n\t"
"movups (%3,%0,4) , %%xmm11 \n\t"
@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"movups %%xmm11, -16(%3,%0,4) \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
:
:
@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -50,7 +50,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vzeroupper \n\t"
:
@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vzeroupper \n\t"
:

View File

@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufps $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addps %%xmm7 , %%xmm11 \n\t"
"movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $8, %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $16, %8 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vzeroupper \n\t"
:
@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"subq $8, %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vzeroupper \n\t"
:

View File

@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"xorps %%xmm11 , %%xmm11 \n\t"
"testq $4 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"jz 2f \n\t"
"movups (%5,%0,4) , %%xmm14 \n\t" // x
"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"subq $4 , %1 \n\t"
"addps %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%5,%0,4) , %%xmm14 \n\t" // x
"movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"3: \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
"haddps %%xmm11, %%xmm11 \n\t"
@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"xorps %%xmm10 , %%xmm10 \n\t"
"testq $4 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"jz 2f \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"addps %%xmm12 , %%xmm10 \n\t"
"subq $4 , %1 \n\t"
".L01LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups 16(%3,%0,4) , %%xmm14 \n\t"
@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"subq $8 , %1 \n\t"
"addps %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"3: \n\t"
"addps %%xmm9 , %%xmm10 \n\t"
"haddps %%xmm10, %%xmm10 \n\t"
@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"shufps $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,4) , %%xmm12 \n\t"
"movups (%4,%0,4) , %%xmm11 \n\t"
@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"subq $4 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"prefetcht0 384(%4,%0,4) \n\t"
@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $16, %1 \n\t"
"vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"

View File

@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"

View File

@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"xorps %%xmm7 , %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addps %%xmm10, %%xmm6 \n\t"
"addps %%xmm11, %%xmm7 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"haddps %%xmm4, %%xmm4 \n\t"
"haddps %%xmm5, %%xmm5 \n\t"

View File

@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $4 , %1 \n\t"
"vaddps %%xmm7, %%xmm11, %%xmm7 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $8 , %1 \n\t"
"vaddps %%ymm7, %%ymm11, %%ymm7 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $16, %1 \n\t"
"vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
"vaddps %%ymm4, %%ymm0, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm1, %%ymm5 \n\t"

View File

@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovups %%xmm9 , -16(%3,%0,4) \n\t"
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovss (%9), %%xmm4 \n\t"
"vmovss 4(%9), %%xmm5 \n\t"

View File

@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
"shufps $0, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
"addq $4 , %0 \n\t"
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"movss (%9), %%xmm4 \n\t" // temp1[0]
"movss 4(%9), %%xmm5 \n\t" // temp1[1]

View File

@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x
"vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
"vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"

View File

@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"haddps %%xmm0, %%xmm0 \n\t"
"haddps %%xmm1, %%xmm1 \n\t"

View File

@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 768(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x
@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
"vmovups 32(%2,%0,8), %%ymm9 \n\t"
@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:

View File

@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
//"prefetcht0 256(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 256(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 256(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 192(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
"vmovups 32(%2,%0,8), %%ymm9 \n\t"
@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:

View File

@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovddup (%8) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovddup (%6) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0
"vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovddup (%5) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha

View File

@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%2,%0,8) \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovddup (%8) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%2,%0,8) \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovddup (%6) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%2,%0,8) \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovddup (%5) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha