diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 86407028c..63575c374 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x @@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $16, %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/cgemv_n_microk_haswell-4.c b/kernel/x86_64/cgemv_n_microk_haswell-4.c index 24417ba36..2b9b1f2f1 100644 --- a/kernel/x86_64/cgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_n_microk_haswell-4.c @@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 320(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 @@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %8 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 @@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 320(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 @@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %6 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 @@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 320(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 @@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y "vmovups %%ymm13,-32(%3,%0,4) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b 
\n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %5 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 @@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src "vmovups 32(%2,%0,4), %%ymm9 \n\t" @@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y "vmovups %%ymm13,-32(%3,%0,4) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %6 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src @@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/cgemv_t_microk_haswell-4.c b/kernel/x86_64/cgemv_t_microk_haswell-4.c index 2c506c9e9..5e48650e1 100644 --- a/kernel/x86_64/cgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_t_microk_haswell-4.c @@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 @@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L08END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 192(%5,%0,4) \n\t" @@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L08END%=: \n\t" + "3: \n\t" "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha @@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 @@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L08END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 192(%5,%0,4) \n\t" @@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L08END%=: \n\t" + "3: \n\t" "vbroadcastss (%6) , 
%%xmm0 \n\t" // value from alpha "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha @@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 @@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L08END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 @@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L08END%=: \n\t" + "3: \n\t" "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index b1ef84a18..8c520dcf1 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%3,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x @@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 32ed1857c..38472c520 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "shufpd $0, %%xmm0, %%xmm0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,8) \n\t" // "prefetcht0 192(%3,%0,8) \n\t" @@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 0c77b6349..9756ee46a 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x @@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index dd05053f7..1d10fc2d7 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "xorpd %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x "movups 
(%3,%0,8), %%xmm8 \n\t" // 2 * y @@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "addpd %%xmm5, %%xmm4 \n\t" "addpd %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 371fd73ee..4da73af3e 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufpd $0, %%xmm13, %%xmm13 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y @@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "shufpd $0, %%xmm12, %%xmm12 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y @@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 2c77f3469..e1587b57c 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L8LABEL%= \n\t" + "jz 2f \n\t" "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" @@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L8LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $8 , %1 \n\t" "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastsd (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L8LABEL%= \n\t" + "jz 2f \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L8LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L8END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y @@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L8END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c index e311326f1..0d2c24d52 100644 --- 
a/kernel/x86_64/dgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y @@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufpd $0, %%xmm6 , %%xmm6 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y @@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ebec7d2c3..ee99228aa 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "xorpd %%xmm11 , %%xmm11 \n\t" "testq $2 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 @@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $2 , %1 \n\t" "addpd %%xmm13 , %%xmm11 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 @@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "haddpd %%xmm10, %%xmm10 \n\t" "haddpd %%xmm11, %%xmm11 \n\t" @@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "xorpd %%xmm10 , %%xmm10 \n\t" "testq $2 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups (%4,%0,8) , %%xmm11 \n\t" @@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addpd %%xmm12 , %%xmm10 \n\t" "subq $2 , %1 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups 16(%3,%0,8) , %%xmm14 \n\t" @@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "subq $4 , %1 \n\t" "addpd %%xmm14 , %%xmm9 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "addpd %%xmm9 , %%xmm10 \n\t" "haddpd %%xmm10, %%xmm10 \n\t" @@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "shufpd $0 , %%xmm10 , %%xmm10 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups (%4,%0,8) , %%xmm11 \n\t" @@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "subq $2 , %1 \n\t" "movups %%xmm11, -16(%4,%0,8) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 33b43515d..1e76a57a6 100644 --- 
a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x @@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x @@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "3: \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index 70d8df36b..d84470cc4 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1] ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x @@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovups %%xmm11 , -16(%3,%0,8) \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovsd (%9), %%xmm4 \n\t" "vmovsd 8(%9), %%xmm5 \n\t" diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index 3ba596c5e..f7f7954b2 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "shufpd $0, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x "movups %%xmm12 , %%xmm11 \n\t" @@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "movsd (%9), %%xmm4 \n\t" // temp1[0] "movsd 8(%9), %%xmm5 \n\t" // temp1[1] diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index 492920253..d7166fe4b 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x @@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovups %%xmm9 , -32(%3,%0,8) \n\t" "vmovups %%xmm11 , -16(%3,%0,8) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 6aab57500..75e3d02d1 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ 
b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x "movups %%xmm12 , %%xmm11 \n\t" @@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y "subq $2 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddpd %%xmm0, %%xmm0 \n\t" "haddpd %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 14ff51a0d..a09494935 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "shufps $0, %%xmm0, %%xmm0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,4) \n\t" // "prefetcht0 192(%3,%0,4) \n\t" @@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 024b2ce6d..36e61b077 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x @@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 2a918b5ea..b5f6a1c91 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "xorps %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x @@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "addps %%xmm5, %%xmm4 \n\t" "addps %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 0135306af..a840f8ba9 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufps $0, %%xmm13, %%xmm13 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" @@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT #endif -#ifndef HAVE_KERNEL_4x2 +#ifndef HAVE_KERNEL_4x1 static void 
sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "shufps $0, %%xmm12, %%xmm12 \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a @@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "2: \n\t" "testq $0x04, %5 \n\t" - "jz .L08LABEL%= \n\t" + "jz 3f \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a @@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "3: \n\t" : : "r" (i), // 0 @@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) ( ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4) , %%xmm12 \n\t" "movups (%3,%0,4) , %%xmm11 \n\t" @@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "movups %%xmm11, -16(%3,%0,4) \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index 40238be49..2b83b1045 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%xmm8 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $4 , %1 \n\t" "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" : : @@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%xmm8 \n\t" // alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 8f56655a9..79054f6c6 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -50,7 +50,7 @@ static void 
sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" @@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" @@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $16, %1 \n\t" "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : @@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y @@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index 77a1b11aa..167c4be05 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y @@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufps $0, %%xmm6 , %%xmm6 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y @@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addps %%xmm7 , %%xmm11 \n\t" "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff 
--git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index c162eeeb6..7377b545c 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" @@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $8, %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $16, %8 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : @@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "subq $8, %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y @@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index b0e883252..cd13bb67d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "xorps %%xmm11 , %%xmm11 \n\t" "testq $4 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%5,%0,4) , %%xmm14 \n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $4 , %1 \n\t" "addps %%xmm13 , %%xmm11 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%5,%0,4) , %%xmm14 \n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "addq $8 , %0 \n\t" "subq $8 , %1 
\n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm11, %%xmm11 \n\t" @@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "xorps %%xmm10 , %%xmm10 \n\t" "testq $4 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addps %%xmm12 , %%xmm10 \n\t" "subq $4 , %1 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups 16(%3,%0,4) , %%xmm14 \n\t" @@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "addps %%xmm14 , %%xmm9 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "addps %%xmm9 , %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "shufps $0 , %%xmm10 , %%xmm10 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "subq $4 , %1 \n\t" "movups %%xmm11, -16(%4,%0,4) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c index 40e318de3..6e822fba3 100644 --- a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c @@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" @@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x @@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "prefetcht0 384(%4,%0,4) \n\t" @@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $16, %1 \n\t" "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c index 016cb35e7..14fe1ecad 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x @@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x @@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x @@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c index 4a167900e..4f07d9640 100644 --- a/kernel/x86_64/sgemv_t_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "xorps %%xmm7 , %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 @@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addps %%xmm10, %%xmm6 \n\t" "addps %%xmm11, %%xmm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddps %%xmm4, %%xmm4 \n\t" "haddps %%xmm5, %%xmm5 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c index 6550518f7..76868ab14 100644 --- a/kernel/x86_64/sgemv_t_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x @@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $4 , %1 \n\t" "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x @@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x @@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $16, %1 \n\t" "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index c9206c1be..9002228f3 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3] ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,4), %%xmm8 
\n\t" // 2 * x @@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovups %%xmm9 , -16(%3,%0,4) \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovss (%9), %%xmm4 \n\t" "vmovss 4(%9), %%xmm5 \n\t" diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index a1c62caf6..fb5337946 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "shufps $0, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "addq $4 , %0 \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "movss (%9), %%xmm4 \n\t" // temp1[0] "movss 4(%9), %%xmm5 \n\t" // temp1[1] diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index b8b3b73e9..8c01ab806 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index 9505a395a..2fb8f4494 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddps %%xmm0, %%xmm0 \n\t" "haddps %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 780109b69..f9732cd4e 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x @@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/zgemv_n_microk_haswell-4.c b/kernel/x86_64/zgemv_n_microk_haswell-4.c index 61358508a..b38cc5763 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-4.c @@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups 
(%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/zgemv_n_microk_sandy-4.c b/kernel/x86_64/zgemv_n_microk_sandy-4.c index 009e4801e..82fc543de 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-4.c @@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" //"prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, 
BLASLONG inc_dest,FLOAT a "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c index 006db226b..792c7e952 100644 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha @@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha @@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha diff --git a/kernel/x86_64/zgemv_t_microk_haswell-4.c b/kernel/x86_64/zgemv_t_microk_haswell-4.c index c87b5ce0f..8a851a54c 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha @@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) 
\n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha @@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha