Merge pull request #473 from wernsaar/develop
changed inline assembler labels to short form
commit 8fe7a9ce6f
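For context, the single pattern applied throughout the diff below: the kernels previously generated per-statement-unique label names with the GAS `%=` operand (e.g. `.L01LOOP%=`), and the commit replaces these with GNU assembler local numeric labels, where `1:` defines a label, `1b` jumps to the nearest `1` looking backward, and `2f` to the nearest `2` looking forward. A minimal sketch of the two styles on a hypothetical countdown loop (not one of the kernels below; the local-label form is what the commit adopts):

#include <stdint.h>

// Hypothetical example, not an OpenBLAS kernel.
// Old style: ".Lname%=" relies on %= expanding to a unique number per asm.
// New style: numeric local labels; "1b" = nearest "1:" backward,
// "2f" = nearest "2:" forward.
static inline void count_down(uint64_t n)
{
    __asm__ __volatile__
    (
        "cmpq   $0, %0      \n\t"
        "je     2f          \n\t"   // skip the loop when n == 0
        ".align 16          \n\t"
        "1:                 \n\t"   // was: ".L01LOOP%=:"
        "subq   $1, %0      \n\t"
        "jnz    1b          \n\t"   // was: "jnz .L01LOOP%="
        "2:                 \n\t"   // was: ".L01END%=:"
        : "+r" (n)
        :
        : "cc"
    );
}

Numeric local labels can be reused freely across (and within) asm statements, so they avoid duplicate-symbol errors when the compiler clones or inlines a kernel into the same translation unit — presumably the motivation for the short form here.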
@@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 768(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
@@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %8 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %6 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
 "vmovups %%ymm13,-32(%3,%0,4) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %5 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0

@@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
 "vmovups 32(%2,%0,4), %%ymm9 \n\t"
@@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
 "vmovups %%ymm13,-32(%3,%0,4) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %6 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src

@@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "prefetcht0 192(%5,%0,4) \n\t"
@@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L08END%=: \n\t"
+"3: \n\t"

 "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
@@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "prefetcht0 192(%5,%0,4) \n\t"
@@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L08END%=: \n\t"
+"3: \n\t"

 "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
@@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0

@@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0

@@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *

 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L08END%=: \n\t"
+"3: \n\t"

 "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
@@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vmovddup (%4), %%xmm0 \n\t" // alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 768(%3,%0,8) \n\t"
 "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "shufpd $0, %%xmm0, %%xmm0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,8) \n\t"
 // "prefetcht0 192(%3,%0,8) \n\t"

@@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
 "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
 "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
@@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
 "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
@@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "xorpd %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
 "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
@@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "addpd %%xmm5, %%xmm4 \n\t"
 "addpd %%xmm7, %%xmm6 \n\t"
@@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufpd $0, %%xmm13, %%xmm13 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
 "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y

@@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "shufpd $0, %%xmm12, %%xmm12 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
 "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
 "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
@@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L8LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L8LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 3f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8 , %1 \n\t"
 "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastsd (%8), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L8LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L8LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L8END%= \n\t"
+"je 3f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
 "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
@@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L8END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorpd %%xmm4 , %%xmm4 \n\t"
 "xorpd %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufpd $0, %%xmm6 , %%xmm6 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorpd %%xmm4 , %%xmm4 \n\t"
 "xorpd %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "xorpd %%xmm11 , %%xmm11 \n\t"

 "testq $2 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%5,%0,8) , %%xmm14 \n\t" // x
 "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "subq $2 , %1 \n\t"
 "addpd %%xmm13 , %%xmm11 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%5,%0,8) , %%xmm14 \n\t" // x
 "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "haddpd %%xmm10, %%xmm10 \n\t"
 "haddpd %%xmm11, %%xmm11 \n\t"
@@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "xorpd %%xmm10 , %%xmm10 \n\t"

 "testq $2 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups (%4,%0,8) , %%xmm11 \n\t"
@@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addpd %%xmm12 , %%xmm10 \n\t"
 "subq $2 , %1 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups 16(%3,%0,8) , %%xmm14 \n\t"
@@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "subq $4 , %1 \n\t"
 "addpd %%xmm14 , %%xmm9 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "addpd %%xmm9 , %%xmm10 \n\t"
 "haddpd %%xmm10, %%xmm10 \n\t"
@@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "shufpd $0 , %%xmm10 , %%xmm10 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups (%4,%0,8) , %%xmm11 \n\t"
@@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "subq $2 , %1 \n\t"
 "movups %%xmm11, -16(%4,%0,8) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x

@@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 384(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
 "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
@@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"3: \n\t"

 "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
 "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
@@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovups %%xmm11 , -16(%3,%0,8) \n\t"

 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovsd (%9), %%xmm4 \n\t"
 "vmovsd 8(%9), %%xmm5 \n\t"
@@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "shufpd $0, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
 "movups %%xmm12 , %%xmm11 \n\t"
@@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y

 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "movsd (%9), %%xmm4 \n\t" // temp1[0]
 "movsd 8(%9), %%xmm5 \n\t" // temp1[1]
@@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "vmovups %%xmm9 , -32(%3,%0,8) \n\t"
 "vmovups %%xmm11 , -16(%3,%0,8) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
 "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
@@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
 "movups %%xmm12 , %%xmm11 \n\t"
@@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y

 "subq $2 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "haddpd %%xmm0, %%xmm0 \n\t"
 "haddpd %%xmm1, %%xmm1 \n\t"
@@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "shufps $0, %%xmm0, %%xmm0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,4) \n\t"
 // "prefetcht0 192(%3,%0,4) \n\t"

@@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
 "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
@@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
 "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
@@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "xorps %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
 "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "addps %%xmm5, %%xmm4 \n\t"
 "addps %%xmm7, %%xmm6 \n\t"
@@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufps $0, %%xmm13, %%xmm13 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y

 "movups (%4,%0,4), %%xmm8 \n\t"
@@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y

 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 #endif

-#ifndef HAVE_KERNEL_4x2
+#ifndef HAVE_KERNEL_4x1

 static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

@@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "shufps $0, %%xmm12, %%xmm12 \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a

 "subq $8 , %1 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"2: \n\t"

 "testq $0x04, %5 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 3f \n\t"

 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"3: \n\t"
 :
 :
 "r" (i), // 0
@@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 (

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%2,%0,4) , %%xmm12 \n\t"
 "movups (%3,%0,4) , %%xmm11 \n\t"
@@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 "movups %%xmm11, -16(%3,%0,4) \n\t"

 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%xmm8 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $4 , %1 \n\t"
 "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8 , %1 \n\t"


-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y

 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"

 :
 :
@@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%xmm8 \n\t" // alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"

@@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -50,7 +50,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
 "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
@@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $16, %1 \n\t"
 "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :
@@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
 "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
@@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :
@@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorps %%xmm4 , %%xmm4 \n\t"
 "xorps %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
@@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

 "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufps $0, %%xmm6 , %%xmm6 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorps %%xmm4 , %%xmm4 \n\t"
 "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y

@@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addps %%xmm7 , %%xmm11 \n\t"
 "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
@@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4, %0 \n\t"
 "subq $4, %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8, %1 \n\t"


-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"

@@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $16, %8 \n\t"
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :
@@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4, %0 \n\t"
 "subq $4, %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "subq $8, %1 \n\t"


-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
 "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
@@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :
@@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "xorps %%xmm11 , %%xmm11 \n\t"

 "testq $4 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%5,%0,4) , %%xmm14 \n\t" // x
 "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "subq $4 , %1 \n\t"
 "addps %%xmm13 , %%xmm11 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%5,%0,4) , %%xmm14 \n\t" // x
 "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "haddps %%xmm10, %%xmm10 \n\t"
 "haddps %%xmm11, %%xmm11 \n\t"
@@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "xorps %%xmm10 , %%xmm10 \n\t"

 "testq $4 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups (%4,%0,4) , %%xmm11 \n\t"
@@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addps %%xmm12 , %%xmm10 \n\t"
 "subq $4 , %1 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups 16(%3,%0,4) , %%xmm14 \n\t"
@@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "addps %%xmm14 , %%xmm9 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "addps %%xmm9 , %%xmm10 \n\t"
 "haddps %%xmm10, %%xmm10 \n\t"
@@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "shufps $0 , %%xmm10 , %%xmm10 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups (%4,%0,4) , %%xmm11 \n\t"
@@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "subq $4 , %1 \n\t"
 "movups %%xmm11, -16(%4,%0,4) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
@@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

 "prefetcht0 384(%4,%0,4) \n\t"
@@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $16, %1 \n\t"
 "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
 "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
 "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
@@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

@@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

@@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 384(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
 "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"

 "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
 "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
@@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "xorps %%xmm7 , %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
@@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addps %%xmm10, %%xmm6 \n\t"
 "addps %%xmm11, %%xmm7 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "haddps %%xmm4, %%xmm4 \n\t"
 "haddps %%xmm5, %%xmm5 \n\t"
@@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

@@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $4 , %1 \n\t"
 "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

@@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 384(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
 "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $16, %1 \n\t"
 "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"

 "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t"
 "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t"
@@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
@@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovups %%xmm9 , -16(%3,%0,4) \n\t"

 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovss (%9), %%xmm4 \n\t"
 "vmovss 4(%9), %%xmm5 \n\t"
@@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
 "shufps $0, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y

@@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F

 "addq $4 , %0 \n\t"
 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "movss (%9), %%xmm4 \n\t" // temp1[0]
 "movss 4(%9), %%xmm5 \n\t" // temp1[1]
@@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x
 "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
 "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
@@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y

@@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "haddps %%xmm0, %%xmm0 \n\t"
 "haddps %%xmm1, %%xmm1 \n\t"
@@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 768(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x
@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
 "vmovups 32(%2,%0,8), %%ymm9 \n\t"
@@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 //"prefetcht0 256(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 // "prefetcht0 256(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 // "prefetcht0 256(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
 "vmovups 32(%2,%0,8), %%ymm9 \n\t"
@@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"

 :
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0
 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *

 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha