From bc5fff7085f7d553468e8606d85bca0d8bc2fa66 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 7 Dec 2014 12:38:54 +0100 Subject: [PATCH 01/37] changed inline assembler labels to short form --- kernel/x86_64/caxpy_microk_bulldozer-2.c | 4 +- kernel/x86_64/cgemv_n_microk_haswell-4.c | 48 +++++++++++----------- kernel/x86_64/cgemv_t_microk_haswell-4.c | 36 ++++++++-------- kernel/x86_64/daxpy_microk_bulldozer-2.c | 4 +- kernel/x86_64/daxpy_microk_nehalem-2.c | 4 +- kernel/x86_64/ddot_microk_bulldozer-2.c | 4 +- kernel/x86_64/ddot_microk_nehalem-2.c | 4 +- kernel/x86_64/dgemv_n_4.c | 8 ++-- kernel/x86_64/dgemv_n_microk_haswell-4.c | 24 +++++------ kernel/x86_64/dgemv_n_microk_nehalem-4.c | 8 ++-- kernel/x86_64/dgemv_t_4.c | 28 ++++++------- kernel/x86_64/dgemv_t_microk_haswell-4.c | 12 +++--- kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 +- kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 +- kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 4 +- kernel/x86_64/dsymv_U_microk_nehalem-2.c | 4 +- kernel/x86_64/saxpy_microk_nehalem-2.c | 4 +- kernel/x86_64/sdot_microk_bulldozer-2.c | 4 +- kernel/x86_64/sdot_microk_nehalem-2.c | 4 +- kernel/x86_64/sgemv_n_4.c | 22 +++++----- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 20 ++++----- kernel/x86_64/sgemv_n_microk_haswell-4.c | 32 +++++++-------- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 8 ++-- kernel/x86_64/sgemv_n_microk_sandy-4.c | 32 +++++++-------- kernel/x86_64/sgemv_t_4.c | 28 ++++++------- kernel/x86_64/sgemv_t_microk_bulldozer-4.c | 16 ++++---- kernel/x86_64/sgemv_t_microk_haswell-4.c | 16 ++++---- kernel/x86_64/sgemv_t_microk_nehalem-4.c | 4 +- kernel/x86_64/sgemv_t_microk_sandy-4.c | 16 ++++---- kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 +- kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 +- kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 4 +- kernel/x86_64/ssymv_U_microk_nehalem-2.c | 4 +- kernel/x86_64/zaxpy_microk_bulldozer-2.c | 4 +- kernel/x86_64/zgemv_n_microk_haswell-4.c | 16 ++++---- 
kernel/x86_64/zgemv_n_microk_sandy-4.c | 16 ++++---- kernel/x86_64/zgemv_t_microk_bulldozer-4.c | 12 +++--- kernel/x86_64/zgemv_t_microk_haswell-4.c | 12 +++--- 38 files changed, 241 insertions(+), 241 deletions(-) diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 86407028c..63575c374 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x @@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $16, %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/cgemv_n_microk_haswell-4.c b/kernel/x86_64/cgemv_n_microk_haswell-4.c index 24417ba36..2b9b1f2f1 100644 --- a/kernel/x86_64/cgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_n_microk_haswell-4.c @@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 320(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 @@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %8 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 @@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT 
**ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 320(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 @@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %6 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 @@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 320(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 @@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y "vmovups %%ymm13,-32(%3,%0,4) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %5 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 @@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT 
*ap, FLOAT *x, FLOAT *y) "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i "cmpq $0 , %1 \n\t" - "je .L01END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src "vmovups 32(%2,%0,4), %%ymm9 \n\t" @@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y "vmovups %%ymm13,-32(%3,%0,4) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "2: \n\t" "cmpq $4, %6 \n\t" - "jne .L02END%= \n\t" + "jne 3f \n\t" "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src @@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - ".L02END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/cgemv_t_microk_haswell-4.c b/kernel/x86_64/cgemv_t_microk_haswell-4.c index 2c506c9e9..5e48650e1 100644 --- a/kernel/x86_64/cgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_t_microk_haswell-4.c @@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 @@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L08END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values 
from a0 "prefetcht0 192(%5,%0,4) \n\t" @@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L08END%=: \n\t" + "3: \n\t" "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha @@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 @@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L08END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 192(%5,%0,4) \n\t" @@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L08END%=: \n\t" + "3: \n\t" "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha @@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 @@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L08END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,4) 
\n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 @@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L08END%=: \n\t" + "3: \n\t" "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index b1ef84a18..8c520dcf1 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%3,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x @@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 32ed1857c..38472c520 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "shufpd $0, %%xmm0, %%xmm0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,8) \n\t" // "prefetcht0 192(%3,%0,8) \n\t" @@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 0c77b6349..9756ee46a 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vxorpd 
%%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x @@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index dd05053f7..1d10fc2d7 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "xorpd %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y @@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "addpd %%xmm5, %%xmm4 \n\t" "addpd %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 371fd73ee..4da73af3e 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufpd $0, %%xmm13, %%xmm13 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y @@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "shufpd $0, %%xmm12, %%xmm12 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a "movups 
16(%4,%0,8), %%xmm9 \n\t" // 2 * a "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y @@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 2c77f3469..e1587b57c 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L8LABEL%= \n\t" + "jz 2f \n\t" "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" @@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L8LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $8 , %1 \n\t" "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : @@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastsd (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L8LABEL%= \n\t" + "jz 2f \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L8LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L8END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , 
%%ymm5, %%ymm5 \n\t" "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y @@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L8END%=: \n\t" + "3: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c index e311326f1..0d2c24d52 100644 --- a/kernel/x86_64/dgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y @@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufpd $0, %%xmm6 , %%xmm6 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y @@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ebec7d2c3..ee99228aa 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "xorpd %%xmm11 , %%xmm11 \n\t" "testq $2 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 @@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $2 , %1 \n\t" "addpd %%xmm13 , %%xmm11 
\n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 @@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "haddpd %%xmm10, %%xmm10 \n\t" "haddpd %%xmm11, %%xmm11 \n\t" @@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "xorpd %%xmm10 , %%xmm10 \n\t" "testq $2 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups (%4,%0,8) , %%xmm11 \n\t" @@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addpd %%xmm12 , %%xmm10 \n\t" "subq $2 , %1 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups 16(%3,%0,8) , %%xmm14 \n\t" @@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "subq $4 , %1 \n\t" "addpd %%xmm14 , %%xmm9 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "addpd %%xmm9 , %%xmm10 \n\t" "haddpd %%xmm10, %%xmm10 \n\t" @@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "shufpd $0 , %%xmm10 , %%xmm10 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups (%4,%0,8) , %%xmm11 \n\t" @@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "subq $2 , %1 \n\t" "movups %%xmm11, -16(%4,%0,8) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 33b43515d..1e76a57a6 100644 --- 
a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x @@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x @@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "3: \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index 70d8df36b..d84470cc4 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1] ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x @@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovups %%xmm11 , -16(%3,%0,8) \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovsd (%9), %%xmm4 \n\t" "vmovsd 8(%9), %%xmm5 \n\t" diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index 3ba596c5e..f7f7954b2 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ 
b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "shufpd $0, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x "movups %%xmm12 , %%xmm11 \n\t" @@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "movsd (%9), %%xmm4 \n\t" // temp1[0] "movsd 8(%9), %%xmm5 \n\t" // temp1[1] diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index 492920253..d7166fe4b 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x @@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovups %%xmm9 , -32(%3,%0,8) \n\t" "vmovups %%xmm11 , -16(%3,%0,8) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 6aab57500..75e3d02d1 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x "movups %%xmm12 , %%xmm11 \n\t" @@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT 
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y "subq $2 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddpd %%xmm0, %%xmm0 \n\t" "haddpd %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 14ff51a0d..a09494935 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "shufps $0, %%xmm0, %%xmm0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,4) \n\t" // "prefetcht0 192(%3,%0,4) \n\t" @@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 024b2ce6d..36e61b077 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x @@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 2a918b5ea..b5f6a1c91 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "xorps %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x 
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x @@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "addps %%xmm5, %%xmm4 \n\t" "addps %%xmm7, %%xmm6 \n\t" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 0135306af..a840f8ba9 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufps $0, %%xmm13, %%xmm13 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" @@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT #endif -#ifndef HAVE_KERNEL_4x2 +#ifndef HAVE_KERNEL_4x1 static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "shufps $0, %%xmm12, %%xmm12 \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 2f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a @@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "2: \n\t" "testq $0x04, %5 \n\t" - "jz .L08LABEL%= \n\t" + "jz 3f \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a @@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT 
*x, FLOAT *y, FLOAT *a "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "3: \n\t" : : "r" (i), // 0 @@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) ( ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4) , %%xmm12 \n\t" "movups (%3,%0,4) , %%xmm11 \n\t" @@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "movups %%xmm11, -16(%3,%0,4) \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index 40238be49..2b83b1045 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%xmm8 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $4 , %1 \n\t" "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" : : @@ -227,7 +227,7 @@ static void 
sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%xmm8 \n\t" // alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" @@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 8f56655a9..79054f6c6 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -50,7 +50,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" @@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" @@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $16, %1 \n\t" "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : @@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // 
alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" @@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y @@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index 77a1b11aa..167c4be05 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y @@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : @@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufps $0, %%xmm6 , %%xmm6 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "movups (%3,%0,4), %%xmm7 \n\t" 
// 4 * y @@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addps %%xmm7 , %%xmm11 \n\t" "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index c162eeeb6..7377b545c 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" @@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "subq $8, %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addq $16, %8 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : @@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, 
FLOAT *y, FLOAT "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" @@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "subq $8, %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y @@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index b0e883252..cd13bb67d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "xorps %%xmm11 , %%xmm11 \n\t" "testq $4 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%5,%0,4) , %%xmm14 \n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $4 , %1 \n\t" "addps %%xmm13 , %%xmm11 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%5,%0,4) , %%xmm14 \n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm11, %%xmm11 \n\t" @@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, 
FLOAT *x, FLOAT *y) "xorps %%xmm10 , %%xmm10 \n\t" "testq $4 , %1 \n\t" - "jz .L01LABEL%= \n\t" + "jz 2f \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addps %%xmm12 , %%xmm10 \n\t" "subq $4 , %1 \n\t" - ".L01LABEL%=: \n\t" + "2: \n\t" "cmpq $0, %1 \n\t" - "je .L01END%= \n\t" + "je 3f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups 16(%3,%0,4) , %%xmm14 \n\t" @@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "addps %%xmm14 , %%xmm9 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L01END%=: \n\t" + "3: \n\t" "addps %%xmm9 , %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "shufps $0 , %%xmm10 , %%xmm10 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "subq $4 , %1 \n\t" "movups %%xmm11, -16(%4,%0,4) \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c index 40e318de3..6e822fba3 100644 --- a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c @@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" @@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" 
+ "jz 3f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x @@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "prefetcht0 384(%4,%0,4) \n\t" @@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $16, %1 \n\t" "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c index 016cb35e7..14fe1ecad 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x @@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x @@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x @@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT 
**ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c index 4a167900e..4f07d9640 100644 --- a/kernel/x86_64/sgemv_t_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "xorps %%xmm7 , %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 @@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addps %%xmm10, %%xmm6 \n\t" "addps %%xmm11, %%xmm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddps %%xmm4, %%xmm4 \n\t" "haddps %%xmm5, %%xmm5 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c index 6550518f7..76868ab14 100644 --- a/kernel/x86_64/sgemv_t_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x @@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $4 , %1 \n\t" "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x @@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 
384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x @@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $16, %1 \n\t" "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index c9206c1be..9002228f3 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3] ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x @@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovups %%xmm9 , -16(%3,%0,4) \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovss (%9), %%xmm4 \n\t" "vmovss 4(%9), %%xmm5 \n\t" diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index a1c62caf6..fb5337946 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "shufps $0, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "addq $4 , %0 \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "movss (%9), %%xmm4 \n\t" // temp1[0] "movss 4(%9), %%xmm5 \n\t" // temp1[1] diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c 
b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index b8b3b73e9..8c01ab806 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index 9505a395a..2fb8f4494 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddps %%xmm0, %%xmm0 \n\t" "haddps %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 780109b69..f9732cd4e 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x @@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq 
$8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/zgemv_n_microk_haswell-4.c b/kernel/x86_64/zgemv_n_microk_haswell-4.c index 61358508a..b38cc5763 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-4.c @@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i ".align 16 \n\t" - 
".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/zgemv_n_microk_sandy-4.c b/kernel/x86_64/zgemv_n_microk_sandy-4.c index 009e4801e..82fc543de 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-4.c @@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" //"prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -356,7 +356,7 @@ static 
void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c index 006db226b..792c7e952 100644 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha @@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha @@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, 
FLOAT * "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha diff --git a/kernel/x86_64/zgemv_t_microk_haswell-4.c b/kernel/x86_64/zgemv_t_microk_haswell-4.c index c87b5ce0f..8a851a54c 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha @@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha @@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG 
n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha From b8ff6892f6bbef4b92b06996ba88d7debe373ee9 Mon Sep 17 00:00:00 2001 From: xantares Date: Tue, 9 Dec 2014 10:18:18 +0100 Subject: [PATCH 02/37] set OPENBLAS_CMAKE_DIR to /lib/cmake/ usually these files are more often located in this subdir --- Makefile.install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 04323eef5..f6c3b9a02 100644 --- a/Makefile.install +++ b/Makefile.install @@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) -OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake +OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake .PHONY : install From 7a6a141bc4c4bc84c8eeae8613d4deec6ee9f048 Mon Sep 17 00:00:00 2001 From: xantares Date: Tue, 9 Dec 2014 10:34:41 +0100 Subject: [PATCH 03/37] add OpenBLAS_VERSION to cmake config file --- Makefile.install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 04323eef5..eeb79f599 100644 --- a/Makefile.install +++ b/Makefile.install @@ -95,7 +95,8 @@ endif endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > 
$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) From 97de657d38cfd2ccacee54ec7920afa61a5967e7 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 11 Dec 2014 13:53:59 +0100 Subject: [PATCH 04/37] added tests to sep.as as workaround for gfortran-4.8.x --- lapack-netlib/TESTING/sep.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/sep.in b/lapack-netlib/TESTING/sep.in index e0ed58512..4d10b6c19 100644 --- a/lapack-netlib/TESTING/sep.in +++ b/lapack-netlib/TESTING/sep.in @@ -1,6 +1,6 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines -6 Number of values of N -0 1 2 3 5 20 Values of N (dimension) +8 Number of values of N +0 1 2 3 5 19 20 21 Values of N (dimension) 5 Number of values of NB 1 3 3 3 10 Values of NB (blocksize) 2 2 2 2 2 Values of NBMIN (minimum blocksize) From ec85c4a51d01f7c4d2a9ffeff9c15ff451054a62 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 11 Dec 2014 14:57:41 +0100 Subject: [PATCH 05/37] Increased the Threshold value in sep.in --- lapack-netlib/TESTING/sep.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/sep.in b/lapack-netlib/TESTING/sep.in index 4d10b6c19..19bd7c3da 100644 --- a/lapack-netlib/TESTING/sep.in +++ b/lapack-netlib/TESTING/sep.in @@ -5,7 +5,7 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines 1 3 3 3 10 Values of NB (blocksize) 2 2 2 2 2 Values of NBMIN (minimum blocksize) 1 0 5 9 1 Values of NX (crossover point) -60.0 Threshold value +160.0 Threshold value T Put T to test the LAPACK routines T Put T to test the driver routines T Put T to test the error exits From 3e81c99b6be993dfea104cb1f87a4da8ca52a253 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 13 Dec 2014 13:05:06 +0800 
Subject: [PATCH 06/37] Fixed installation bug on Mac OSX. --- Makefile.install | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.install b/Makefile.install index 071d193ab..e1deaae3e 100644 --- a/Makefile.install +++ b/Makefile.install @@ -46,11 +46,11 @@ ifndef NO_CBLAS endif ifndef NO_LAPACKE - @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h endif #for install static library From 113b48ca2222b5f46bb40425f21e52906958473e Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 17 Dec 2014 14:12:21 +0100 Subject: [PATCH 07/37] modified makefile for acml6.1 --- benchmark/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index cf219cef1..402a2e07b 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system #LIBACML = -fopenmp 
$(ACML)/libacml_mp.a -lgfortran -lm # ACML custom -ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm +#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML 6.1 custom +ACML=/home/saar/acml6.1/gfortran64_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + # Atlas Ubuntu #ATLAS=/usr/lib/atlas-base From 1e566223ed11a6b453a0e37cbb664a4192f04e8b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 17 Dec 2014 15:02:11 +0100 Subject: [PATCH 08/37] added code for the size of n --- benchmark/gemm.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 4f9a58825..347cf0dfa 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -124,8 +124,9 @@ int MAIN__(int argc, char *argv[]){ FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char trans='N'; - blasint m, i, j; + blasint m, n, i, j; int loops = 1; + int has_param_n=0; int l; char *p; @@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){ if ( p != NULL ) loops = atoi(p); + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + has_param_n=1; + } + #ifdef linux srandom(getpid()); @@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){ timeg=0; - fprintf(stderr, " %6d : ", (int)m); + if ( has_param_n == 1 && n <= m ) + n=n; + else + n=m; + + + + fprintf(stderr, " %6dx%d : ", (int)m, (int)n); for (l=0; l Date: Thu, 18 Dec 2014 20:35:51 +0100 Subject: [PATCH 09/37] small optimization on dgemm_kernel for N=1 --- kernel/x86_64/dgemm_kernel_4x4_haswell.S | 79 ++++++++++++++---------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S index a49a51ee9..0a2ca7ae3 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -1092,18 +1092,48 @@ USE OF 
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO .endm .macro KERNEL4x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 addq $ 1*SIZE, BO addq $ 4*SIZE, AO @@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x1 - vmovddup ALPHA, %xmm0 + vbroadcastsd ALPHA, %ymm0 - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 #if !defined(TRMMKERNEL) - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1) , %ymm4, %ymm4 #endif - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %ymm4 , (CO1) addq $ 4*SIZE, CO1 .endm @@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L1_12: - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB + KERNEL4x1 dec %rax jne .L1_12 @@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_12: - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB + KERNEL4x1 dec %rax jne .L1_12 From 887aed634df973d93bb559d14b7dc08b343c60b9 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 19 Dec 2014 12:40:46 +0100 Subject: [PATCH 10/37] modified sources for OS Darwin --- benchmark/axpy.c | 4 ++-- benchmark/cholesky.c | 4 ++-- benchmark/dot.c | 4 ++-- benchmark/geev.c | 4 ++-- benchmark/gemm.c | 4 ++-- benchmark/gemm3m.c | 4 ++-- benchmark/gemv.c | 4 ++-- benchmark/ger.c | 4 ++-- benchmark/getri.c | 4 ++-- benchmark/hemm.c | 4 ++-- benchmark/hemv.c | 4 ++-- benchmark/her2k.c | 4 ++-- benchmark/herk.c | 4 ++-- benchmark/linpack.c | 4 ++-- benchmark/potrf.c | 4 ++-- benchmark/symm.c | 4 ++-- benchmark/symv.c | 4 ++-- benchmark/syr2k.c | 4 ++-- benchmark/syrk.c | 4 ++-- benchmark/trmm.c | 4 ++-- benchmark/trsm.c | 4 ++-- 21 files changed, 42 insertions(+), 42 deletions(-) diff --git a/benchmark/axpy.c b/benchmark/axpy.c index ef3b5ae4f..a7206b690 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; @@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 76c368eda..c8b96d80f 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){ } 
-int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ #ifndef COMPLEX char *trans[] = {"T", "N"}; @@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/dot.c b/benchmark/dot.c index 6132ed324..4c8d6cc38 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT result; @@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/geev.c b/benchmark/geev.c index 3b7465360..a2ca2c315 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; FLOAT wkopt[4]; @@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 347cf0dfa..5a3587622 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -222,4 +222,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index 
048d74be6..d39543585 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/gemv.c b/benchmark/gemv.c index e21868259..42af2825a 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; @@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/ger.c b/benchmark/ger.c index 5085389da..354281006 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; @@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/getri.c b/benchmark/getri.c index 897f1ff04..083cdc9aa 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a,*work; FLOAT wkopt[4]; @@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, 
alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hemm.c b/benchmark/hemm.c index f5d4b4fd9..318c407ba 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 79b7679cc..05028e3cf 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; @@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 49ab8d214..028e2718f 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/herk.c b/benchmark/herk.c index 8c053b019..d2e25ff46 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *c; FLOAT 
alpha[] = {1.0, 1.0}; @@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 98a874208..7d5c87163 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; @@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 7b6cdd799..3caf61caa 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ #ifndef COMPLEX char *trans[] = {"T", "N"}; @@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/symm.c b/benchmark/symm.c index 187dfe2ae..35ebcee97 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/symv.c b/benchmark/symv.c index 4bcfb411b..df2a5d301 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -118,7 
+118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; @@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index e11b04e42..9840b5f3e 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/syrk.c b/benchmark/syrk.c index f01549688..34817f2bb 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *c; FLOAT alpha[] = {1.0, 1.0}; @@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 328dc9a10..f81e9d912 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b; FLOAT alpha[] = {1.0, 1.0}; @@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, 
alias("MAIN__"))); diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 908a0fcb7..ed969b707 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b; FLOAT alpha[] = {1.0, 1.0}; @@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 4de7b9ae470fb98c4d5353371604b025f5b9fcd4 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 22 Dec 2014 14:04:27 +0100 Subject: [PATCH 11/37] increased NMAX to 128 --- lapack-netlib/BLAS/TESTING/cblat2.f | 2 +- lapack-netlib/BLAS/TESTING/cblat3.f | 2 +- lapack-netlib/BLAS/TESTING/dblat2.f | 2 +- lapack-netlib/BLAS/TESTING/dblat3.f | 2 +- lapack-netlib/BLAS/TESTING/sblat2.f | 2 +- lapack-netlib/BLAS/TESTING/sblat3.f | 2 +- lapack-netlib/BLAS/TESTING/zblat2.f | 2 +- lapack-netlib/BLAS/TESTING/zblat3.f | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/BLAS/TESTING/cblat2.f b/lapack-netlib/BLAS/TESTING/cblat2.f index 5833ea81a..2a6edd382 100644 --- a/lapack-netlib/BLAS/TESTING/cblat2.f +++ b/lapack-netlib/BLAS/TESTING/cblat2.f @@ -120,7 +120,7 @@ REAL RZERO PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/cblat3.f b/lapack-netlib/BLAS/TESTING/cblat3.f index 09f2cb9c5..fb2aa4ece 100644 --- a/lapack-netlib/BLAS/TESTING/cblat3.f +++ b/lapack-netlib/BLAS/TESTING/cblat3.f @@ -102,7 +102,7 @@ REAL RZERO PARAMETER ( RZERO = 0.0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, 
NBEMAX = 7 ) * .. Local Scalars .. diff --git a/lapack-netlib/BLAS/TESTING/dblat2.f b/lapack-netlib/BLAS/TESTING/dblat2.f index 0fa80afa4..80623b260 100644 --- a/lapack-netlib/BLAS/TESTING/dblat2.f +++ b/lapack-netlib/BLAS/TESTING/dblat2.f @@ -117,7 +117,7 @@ DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/dblat3.f b/lapack-netlib/BLAS/TESTING/dblat3.f index 8d37c7453..72c17ed3b 100644 --- a/lapack-netlib/BLAS/TESTING/dblat3.f +++ b/lapack-netlib/BLAS/TESTING/dblat3.f @@ -97,7 +97,7 @@ DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. diff --git a/lapack-netlib/BLAS/TESTING/sblat2.f b/lapack-netlib/BLAS/TESTING/sblat2.f index 71605ed31..601add7e9 100644 --- a/lapack-netlib/BLAS/TESTING/sblat2.f +++ b/lapack-netlib/BLAS/TESTING/sblat2.f @@ -117,7 +117,7 @@ REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/sblat3.f b/lapack-netlib/BLAS/TESTING/sblat3.f index 879269633..78d809379 100644 --- a/lapack-netlib/BLAS/TESTING/sblat3.f +++ b/lapack-netlib/BLAS/TESTING/sblat3.f @@ -97,7 +97,7 @@ REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. 
diff --git a/lapack-netlib/BLAS/TESTING/zblat2.f b/lapack-netlib/BLAS/TESTING/zblat2.f index 53129a11e..2e3e08e7c 100644 --- a/lapack-netlib/BLAS/TESTING/zblat2.f +++ b/lapack-netlib/BLAS/TESTING/zblat2.f @@ -121,7 +121,7 @@ DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/zblat3.f b/lapack-netlib/BLAS/TESTING/zblat3.f index 59ca24145..39ce06b99 100644 --- a/lapack-netlib/BLAS/TESTING/zblat3.f +++ b/lapack-netlib/BLAS/TESTING/zblat3.f @@ -104,7 +104,7 @@ DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. From 587e16fba3775cd8587f7d54c19ef1696c88f771 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 22 Dec 2014 17:01:18 +0100 Subject: [PATCH 12/37] Ref #458: Backport, sandybrigde uses nehalem zgemm kernel --- kernel/x86_64/KERNEL.SANDYBRIDGE | 12 ++++++------ param.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 61e13a116..ff96cd011 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S -ZGEMMINCOPY = -ZGEMMITCOPY = +ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMINCOPY = zgemm_ncopy_1.S +ZGEMMITCOPY = zgemm_tcopy_1.S ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = -ZGEMMITCOPYOBJ = +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = 
zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S diff --git a/param.h b/param.h index 28ed91e60..bce05c957 100644 --- a/param.h +++ b/param.h @@ -1129,7 +1129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 -#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 From cd9868b1b4e154724699ac2e38c4032d51908930 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Thu, 25 Dec 2014 17:41:17 -0500 Subject: [PATCH 13/37] Correct ilaver C declaration --- lapack-netlib/lapacke/include/lapacke.h | 10 +++++----- lapack-netlib/lapacke/src/lapacke_ilaver.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lapack-netlib/lapacke/include/lapacke.h b/lapack-netlib/lapacke/include/lapacke.h index a31c10d6d..e506319c2 100644 --- a/lapack-netlib/lapacke/include/lapacke.h +++ b/lapack-netlib/lapacke/include/lapacke.h @@ -10707,9 +10707,9 @@ lapack_int LAPACKE_zsyr_work( int matrix_order, char uplo, lapack_int n, const lapack_complex_double* x, lapack_int incx, lapack_complex_double* a, lapack_int lda ); -void LAPACKE_ilaver( const lapack_int* vers_major, - const lapack_int* vers_minor, - const lapack_int* vers_patch ); +void LAPACKE_ilaver( lapack_int* vers_major, + lapack_int* vers_minor, + lapack_int* vers_patch ); #define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF) @@ -16435,8 +16435,8 @@ void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha, void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha, const lapack_complex_double* x, lapack_int* incx, lapack_complex_double* a, lapack_int* lda ); -void 
LAPACK_ilaver( const lapack_int* vers_major, const lapack_int* vers_minor, - const lapack_int* vers_patch ); +void LAPACK_ilaver( lapack_int* vers_major, lapack_int* vers_minor, + lapack_int* vers_patch ); #ifdef __cplusplus } diff --git a/lapack-netlib/lapacke/src/lapacke_ilaver.c b/lapack-netlib/lapacke/src/lapacke_ilaver.c index bec1d900b..ed362e90b 100644 --- a/lapack-netlib/lapacke/src/lapacke_ilaver.c +++ b/lapack-netlib/lapacke/src/lapacke_ilaver.c @@ -26,16 +26,16 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************** -* Contents: Native high-level C interface to LAPACK function dgesv +* Contents: Native high-level C interface to LAPACK function ilaver * Author: Intel Corporation * Generated November, 2011 *****************************************************************************/ #include "lapacke_utils.h" -void LAPACKE_ilaver( const lapack_int* vers_major, - const lapack_int* vers_minor, - const lapack_int* vers_patch ) +void LAPACKE_ilaver( lapack_int* vers_major, + lapack_int* vers_minor, + lapack_int* vers_patch ) { /* Call LAPACK function */ LAPACK_ilaver( vers_major, vers_minor, vers_patch ); From e9d9a8eae3c9ff68b3d3cbcc794cf32735cda61a Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Fri, 26 Dec 2014 14:42:00 +0100 Subject: [PATCH 14/37] Allow to do gemv and ger buffer allocation on the stack ger and gemv call blas_memory_alloc/free, which in turn call blas_lock. blas_lock creates thread contention when matrices are small and the number of threads is high enough. We avoid calling blas_memory_alloc by replacing it with stack allocation. This can be enabled with: make MAX_STACK_ALLOC=2048 The given size (in bytes) must be high enough to avoid thread contention and small enough to avoid stack overflow.
Fix #478 --- Makefile.system | 4 ++++ interface/gemv.c | 16 +++++++++++++++- interface/ger.c | 13 ++++++++++++- kernel/x86_64/sgemv_t_4.c | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index ec6339d62..6f3c0bc2b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -305,6 +305,10 @@ ifdef SANITY_CHECK CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) endif +ifdef MAX_STACK_ALLOC +CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) +endif + # # Architecture dependent settings # diff --git a/interface/gemv.c b/interface/gemv.c index 2dd82dce5..155305be8 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -208,7 +208,18 @@ void CNAME(enum CBLAS_ORDER order, if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; +#ifdef MAX_STACK_ALLOC + int stack_alloc_size = m + n; + if(stack_alloc_size < 128) + //dgemv_n.S require a 128 bytes buffer + stack_alloc_size = 128; + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) + stack_alloc_size = 0; + FLOAT stack_buffer[stack_alloc_size]; + buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); +#else buffer = (FLOAT *)blas_memory_alloc(1); +#endif #ifdef SMP @@ -237,7 +248,10 @@ void CNAME(enum CBLAS_ORDER order, } #endif - blas_memory_free(buffer); +#ifdef MAX_STACK_ALLOC + if(!stack_alloc_size) +#endif + blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); diff --git a/interface/ger.c b/interface/ger.c index 9857d2423..cac357786 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order, if (incy < 0) y -= (n - 1) * incy; if (incx < 0) x -= (m - 1) * incx; +#ifdef MAX_STACK_ALLOC + int stack_alloc_size = m; + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) + stack_alloc_size = 0; + FLOAT stack_buffer[stack_alloc_size]; + buffer = stack_alloc_size ? 
stack_buffer : (FLOAT *)blas_memory_alloc(1); +#else buffer = (FLOAT *)blas_memory_alloc(1); +#endif #ifdef SMPTEST nthreads = num_cpu_avail(2); @@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order, } #endif - blas_memory_free(buffer); +#ifdef MAX_STACK_ALLOC + if(!stack_alloc_size) +#endif + blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index cd13bb67d..61eb1ed84 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); xbuffer = buffer; - ytemp = buffer + NBMAX; + ytemp = buffer + (m < NBMAX ? m : NBMAX); n0 = n / NBMAX; n1 = (n % NBMAX) >> 2 ; From 4319769b79c24cc5ca5559a53b37241d4770c322 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 28 Dec 2014 20:16:46 +0800 Subject: [PATCH 15/37] added target processor STEAMROLLER --- Makefile.rule | 2 +- Makefile.system | 8 ++- README.md | 1 + TargetList.txt | 1 + common_x86.h | 2 +- common_x86_64.h | 2 +- cpuid.h | 10 ++-- cpuid_x86.c | 18 +++++++ driver/others/dynamic.c | 15 ++++++ driver/others/parameter.c | 4 +- getarch.c | 17 ++++++ kernel/setparam-ref.c | 17 ++++++ kernel/x86_64/KERNEL.STEAMROLLER | 76 ++++++++++++++++++++++++++ kernel/x86_64/ddot.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemv_n_4.c | 2 +- kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- param.h | 93 ++++++++++++++++++++++++++++++++ 19 files changed, 261 insertions(+), 15 deletions(-) create mode 100644 kernel/x86_64/KERNEL.STEAMROLLER diff --git a/Makefile.rule b/Makefile.rule index d3a2d1fa3..4bd1ab110 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -15,7 +15,7 @@ VERSION = 0.2.13 # TARGET = PENRYN # If you want to support multiple architecture in one binary -# DYNAMIC_ARCH = 1 +DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. 
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect. diff --git a/Makefile.system b/Makefile.system index ec6339d62..e3e2d5204 100644 --- a/Makefile.system +++ b/Makefile.system @@ -61,6 +61,9 @@ endif ifeq ($(TARGET), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), STEAMROLLER) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -85,6 +88,9 @@ endif ifeq ($(TARGET_CORE), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET_CORE), STEAMROLLER) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -392,7 +398,7 @@ endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER endif ifneq ($(NO_AVX2), 1) DYNAMIC_CORE += HASWELL diff --git a/README.md b/README.md index f4c547701..cdacf9888 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. +- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. #### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. diff --git a/TargetList.txt b/TargetList.txt index 97661fdcf..c91401f01 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -32,6 +32,7 @@ ISTANBUL BOBCAT BULLDOZER PILEDRIVER +STEAMROLLER c)VIA CPU: SSE_GENERIC diff --git a/common_x86.h b/common_x86.h index f97fd348a..9d82090cc 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif -#if defined(PILEDRIVER) || defined(BULLDOZER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) //Enable some optimazation for barcelona. 
#define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 547614f74..e0a6c4c42 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER -#if defined(PILEDRIVER) || defined(BULLDOZER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/cpuid.h b/cpuid.h index cb4404cb0..ab6a3fb32 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,10 +104,11 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 -#define CORE_BOBCAT 21 -#define CORE_BULLDOZER 22 +#define CORE_BOBCAT 21 +#define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 -#define CORE_HASWELL 24 +#define CORE_HASWELL 24 +#define CORE_STEAMROLLER 25 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -200,6 +201,7 @@ typedef struct { #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 -#define CPUTYPE_HASWELL 48 +#define CPUTYPE_HASWELL 48 +#define CPUTYPE_STEAMROLLER 49 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 44446e582..ef90b26d8 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1162,6 +1162,12 @@ int get_cpuname(void){ return CPUTYPE_PILEDRIVER; else return CPUTYPE_BARCELONA; //OS don't support AVX. + case 0: + if(support_avx()) + return CPUTYPE_STEAMROLLER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. 
+ } break; case 5: @@ -1290,6 +1296,7 @@ static char *cpuname[] = { "BULLDOZER", "PILEDRIVER", "HASWELL", + "STEAMROLLER", }; static char *lowercpuname[] = { @@ -1341,6 +1348,7 @@ static char *lowercpuname[] = { "bulldozer", "piledriver", "haswell", + "steamroller", }; static char *corename[] = { @@ -1369,6 +1377,7 @@ static char *corename[] = { "BULLDOZER", "PILEDRIVER", "HASWELL", + "STEAMROLLER", }; static char *corename_lower[] = { @@ -1397,6 +1406,7 @@ static char *corename_lower[] = { "bulldozer", "piledriver", "haswell", + "steamroller", }; @@ -1562,7 +1572,15 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. + + case 0: + if(support_avx()) + return CORE_STEAMROLLER; + else + return CORE_BARCELONA; //OS don't support AVX. } + + }else return CORE_BARCELONA; } } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1235df2db..6fd1d8cdf 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_STEAMROLLER; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #else @@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA +#define gotoblas_STEAMROLLER gotoblas_BARCELONA #endif @@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } + }else if(model == 0){ + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
+ } } + + } else { return &gotoblas_BARCELONA; } @@ -315,6 +327,7 @@ static char *corename[] = { "Bulldozer", "Piledriver", "Haswell", + "Steamroller", }; char *gotoblas_corename(void) { @@ -339,6 +352,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; + if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; return corename[0]; } @@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 21: return (&gotoblas_STEAMROLLER); case 20: return (&gotoblas_HASWELL); case 19: return (&gotoblas_PILEDRIVER); case 18: return (&gotoblas_BULLDOZER); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index f0f889a15..d741f2fb9 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -166,7 +166,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ void blas_set_parameter(void){ env_var_t p; int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 81ab9e37c..f6a5ecb94 100644 --- a/getarch.c +++ b/getarch.c @@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "PILEDRIVER" #endif +#if defined (FORCE_STEAMROLLER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "STEAMROLLER" +#define ARCHCONFIG "-DSTEAMROLLER " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ + "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" +#define LIBNAME "steamroller" +#define CORENAME "STEAMROLLER" +#endif + + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 0d7bbd4ac..1fa7f7984 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -941,6 +941,23 @@ static void init_parameter(void) { #endif #endif +#ifdef STEAMROLLER + +#ifdef DEBUG + fprintf(stderr, "Steamroller\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER new file mode 100644 index 000000000..55285e3d3 --- /dev/null +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -0,0 +1,76 @@ +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c + +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_4.c + +DGEMVNKERNEL = dgemv_n_bulldozer.S +DGEMVTKERNEL = dgemv_t_bulldozer.S + +DDOTKERNEL = ddot_bulldozer.S +DCOPYKERNEL = dcopy_bulldozer.S + +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = gemm_ncopy_2_bulldozer.S +SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +SGEMMINCOPYOBJ 
= sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S +DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S +DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S +DGEMMONCOPY = gemm_ncopy_2_bulldozer.S +DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index b3aad438f..d501c2f68 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ddot_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 632d16810..6fec48175 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "sdot_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index a840f8ba9..930dd26b2 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index cd13bb67d..2bb5809ea 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 84cf4e2e8..4abb2d5ad 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-4.c" diff --git a/param.h b/param.h index bce05c957..e3e535b14 100644 --- a/param.h +++ b/param.h @@ -406,6 +406,99 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef STEAMROLLER +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 
+#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + + #ifdef ATHLON #define SNUMOPT 4 From 9566f5fdb039b7df8cd4cc15ef4f38ce207c34cc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 28 Dec 2014 13:45:19 +0100 Subject: [PATCH 16/37] added Steamroller as a target processor --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 4bd1ab110..d3a2d1fa3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -15,7 +15,7 @@ VERSION = 0.2.13 # TARGET = PENRYN # If you want to support multiple architecture in one binary -DYNAMIC_ARCH = 1 +# DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. 
From 0dc559ed300e4da5f81e66640a1514f29f7cac63 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 28 Dec 2014 17:15:42 +0100 Subject: [PATCH 17/37] bugfix in dynamic.c --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 6fd1d8cdf..87420938f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -365,7 +365,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; char mname[20]; - for ( i=1 ; i <= 20; i++) + for ( i=1 ; i <= 21; i++) { if (!strncasecmp(coretype,corename[i],20)) { From ddf983d643c3eac781b11d1a323f33f31dd2661a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 30 Dec 2014 20:14:45 +0800 Subject: [PATCH 18/37] added optimizations for steamroller --- Makefile.rule | 2 +- benchmark/Makefile | 2 +- common.h | 8 + driver/others/dynamic.c | 2 +- kernel/x86_64/KERNEL.STEAMROLLER | 24 +- kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++ kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/sgemv_n_4.c | 5 +- kernel/x86_64/sgemv_t_4.c | 4 + kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/zaxpy.c | 2 +- param.h | 12 +- 18 files changed, 299 insertions(+), 25 deletions(-) create mode 100644 kernel/x86_64/dgemv_n_microk_piledriver-4.c diff --git a/Makefile.rule b/Makefile.rule index d3a2d1fa3..979224cc4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -15,7 +15,7 @@ VERSION = 0.2.13 # TARGET = PENRYN # If you want to support multiple architecture in one binary -# DYNAMIC_ARCH = 1 +#DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. 
diff --git a/benchmark/Makefile b/benchmark/Makefile index 402a2e07b..b5eaa9343 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -10,7 +10,7 @@ include $(TOPDIR)/Makefile.system #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML 6.1 custom -ACML=/home/saar/acml6.1/gfortran64_mp/lib +ACML=/home/werner/project/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm diff --git a/common.h b/common.h index 1250e2e61..fe2083469 100644 --- a/common.h +++ b/common.h @@ -327,6 +327,14 @@ typedef int blasint; #endif #endif +/* +#ifdef STEAMROLLER +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif +*/ + #ifndef YIELDING #define YIELDING sched_yield() #endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 6fd1d8cdf..87420938f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -365,7 +365,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; char mname[20]; - for ( i=1 ; i <= 20; i++) + for ( i=1 ; i <= 21; i++) { if (!strncasecmp(coretype,corename[i],20)) { diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index 55285e3d3..f5b5cb942 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -1,15 +1,27 @@ +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c + +DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c +SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c + ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t_4.c -DGEMVNKERNEL = dgemv_n_bulldozer.S -DGEMVTKERNEL = dgemv_t_bulldozer.S - -DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S + SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = 
../generic/gemm_tcopy_16.c @@ -21,8 +33,8 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S -DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S -DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index fa8924ae9..be945a441 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "caxpy_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index f1d50c909..fd5343eba 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" -#elif defined(BULLDOZER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "daxpy_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 4da73af3e..27df12bef 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(STEAMROLLER) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c new file mode 100644 index 000000000..e1587b57c --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "2: \n\t" + + "cmpq $0, %1 \n\t" + "je 3f \n\t" + + + ".align 16 \n\t" + "1: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd 
(%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %8 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + + "jnz 1b \n\t" + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, 
%%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "2: \n\t" + + "cmpq $0, %1 \n\t" + "je 3f \n\t" + + + ".align 16 \n\t" + "1: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ee99228aa..5d85ecab7 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) +#if defined(HASWELL) || defined(STEAMROLLER) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 8d1337746..f6157f791 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dsymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "dsymv_L_microk_nehalem-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 267755c2f..ecfaf5043 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dsymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "dsymv_U_microk_nehalem-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 930dd26b2..840ce9207 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -39,8 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_haswell-4.c" #endif - +#if defined(STEAMROLLER) +#define NBMAX 2048 +#else #define NBMAX 4096 +#endif #ifndef HAVE_KERNEL_4x8 diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 2bb5809ea..b97161612 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_haswell-4.c" #endif +#if defined(STEAMROLLER) +#define NBMAX 2048 +#else #define NBMAX 4096 +#endif #ifndef HAVE_KERNEL_4x4 diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 096adc6ca..a2b716b58 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 61127aa3d..0aadd3fd2 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index ca2f03dd0..52a25c793 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "zaxpy_microk_bulldozer-2.c" #endif diff --git a/param.h b/param.h index e3e535b14..08c5dc81f 100644 --- a/param.h +++ b/param.h @@ -448,9 +448,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 768 -#define ZGEMM_DEFAULT_P 384 -#define CGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 576 +#define ZGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 576 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 @@ -462,9 +462,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 168 -#define ZGEMM_DEFAULT_Q 168 -#define CGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 160 +#define ZGEMM_DEFAULT_Q 160 +#define CGEMM_DEFAULT_Q 160 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 From 5cb5af93336d259a5a62449a9ba4fd9606b3e28c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 2 Jan 2015 02:42:32 +0800 Subject: [PATCH 19/37] Add configuration options. --- Makefile.rule | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 979224cc4..bea1fe194 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -15,7 +15,7 @@ VERSION = 0.2.13 # TARGET = PENRYN # If you want to support multiple architecture in one binary -#DYNAMIC_ARCH = 1 +# DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. @@ -159,6 +159,19 @@ COMMON_PROF = -pg # Build Debug version # DEBUG = 1 +# Improve GEMV and GER for small matrices by stack allocation. +# For details, https://github.com/xianyi/OpenBLAS/pull/482 +# +# MAX_STACK_ALLOC=2048 + +# Add a prefix or suffix to all exported symbol names in the shared library. +# Avoid conflicts with other BLAS libraries, especially when using +# 64 bit integer interfaces in OpenBLAS. 
+# For details, https://github.com/xianyi/OpenBLAS/pull/459 +# +# SYMBOLPREFIX= +# SYMBOLSUFFIX= + # # End of user configuration # From 5344f335a8b3ad9c054c58d90934c3df67e9347a Mon Sep 17 00:00:00 2001 From: kortschak Date: Wed, 7 Jan 2015 10:06:55 +1030 Subject: [PATCH 20/37] Add test for drotmg bug fixed by 692b14c Test requested in issue xianyi/OpenBLAS#484. Run tests by applying the following change and then make: diff --git a/Makefile.rule b/Makefile.rule index bea1fe1..9852ff3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -140,7 +140,7 @@ NO_AFFINITY = 1 -# UTEST_CHECK = 1 +UTEST_CHECK = 1 --- CONTRIBUTORS.md | 3 +++ utest/common_utest.h | 1 + utest/main.c | 1 + utest/test_rotmg.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 02d15b7f3..152ec95aa 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -121,5 +121,8 @@ In chronological order: * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). ARMv8 support. +* Dan Kortschak + * [2015-01-07] Added test for drotmg bug #484. 
+ * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/utest/common_utest.h b/utest/common_utest.h index e8377e681..d170ed27c 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -59,6 +59,7 @@ void test_zdotu_n_1(void); void test_zdotu_offset_1(void); void test_drotmg(void); +void test_drotmg_D1eqD2_X1eqX2(); void test_dsdot_n_1(void); diff --git a/utest/main.c b/utest/main.c index f44008b79..770d1451e 100644 --- a/utest/main.c +++ b/utest/main.c @@ -57,6 +57,7 @@ CU_TestInfo test_level1[]={ {"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, {"Testing drotmg",test_drotmg}, + {"Testing drotmg with D1 == D2 && X1 == X2",test_drotmg_D1eqD2_X1eqX2}, {"Testing dsdot with n == 1",test_dsdot_n_1}, diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index b72446c1b..b175653a6 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -65,3 +65,36 @@ void test_drotmg() CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS); } } + +void test_drotmg_D1eqD2_X1eqX2() +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i=0; + te_d1= tr_d1=2.; + te_d2= tr_d2=2.; + te_x1= tr_x1=8.; + te_y1= tr_y1=8.; + + for(i=0; i<5; i++){ + te_param[i]=tr_param[i]=0.0; + } + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + //reference + BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param); + + CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS); + + for(i=0; i<5; i++){ + CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS); + } +} From 229ce2ccd1cc8c0775442d3b6e725db0618d4567 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 12 Jan 2015 08:55:29 +0000 Subject: [PATCH 21/37] Add cortex-a9 and cortex-a15 targets. 
--- Makefile.arm | 5 ++ cpuid_arm.c | 95 ++++++++++++++++++++++++++----------- getarch.c | 30 ++++++++++++ kernel/arm/KERNEL.CORTEXA15 | 1 + kernel/arm/KERNEL.CORTEXA9 | 1 + param.h | 82 ++++++++++++++++++++++++++++++++ 6 files changed, 186 insertions(+), 28 deletions(-) create mode 100644 kernel/arm/KERNEL.CORTEXA15 create mode 100644 kernel/arm/KERNEL.CORTEXA9 diff --git a/Makefile.arm b/Makefile.arm index 5bdd4d151..9978a672a 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,3 +1,8 @@ +# ifeq logical or +ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +endif ifeq ($(CORE), ARMV7) CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a diff --git a/cpuid_arm.c b/cpuid_arm.c index b7181b2f9..211ea2764 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -30,16 +30,27 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV6 1 #define CPU_ARMV7 2 -#define CPU_CORTEXA15 3 +#define CPU_CORTEXA9 3 +#define CPU_CORTEXA15 4 static char *cpuname[] = { "UNKOWN", "ARMV6", "ARMV7", + "CORTEXA9", "CORTEXA15" }; +static char *cpuname_lower[] = { + "unknown", + "armv6", + "armv7", + "cortexa9", + "cortexa15" +}; + + int get_feature(char *search) { @@ -85,6 +96,29 @@ int detect(void) char buffer[512], *p; p = (char *) NULL ; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("CPU part", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + if(p != NULL) { + if (strstr(p, "0xc09")) { + return CPU_CORTEXA9; + } + if (strstr(p, "0xc15")) { + return CPU_CORTEXA15; + } + + } + + p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) @@ -142,21 +176,7 @@ void get_architecture(void) void get_subarchitecture(void) { int d = detect(); - switch (d) - { - - case CPU_ARMV7: - printf("ARMV7"); - break; - - case CPU_ARMV6: - 
printf("ARMV6"); - break; - - default: - printf("UNKNOWN"); - break; - } + printf("%s", cpuname[d]); } void get_subdirname(void) @@ -170,6 +190,36 @@ void get_cpuconfig(void) int d = detect(); switch (d) { + case CPU_CORTEXA9: + printf("#define CORTEXA9\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + if ( get_feature("neon")) printf("#define HAVE_NEON\n"); + if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 128\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + case CPU_CORTEXA15: + printf("#define CORTEXA15\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + if ( get_feature("neon")) printf("#define HAVE_NEON\n"); + if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 128\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + case CPU_ARMV7: printf("#define ARMV7\n"); @@ -206,18 +256,7 @@ void get_libname(void) { int d = detect(); - switch (d) - { - - case CPU_ARMV7: - printf("armv7\n"); - break; - - case CPU_ARMV6: - printf("armv6\n"); - break; - - } + printf("%s", cpuname_lower[d]); } diff --git a/getarch.c b/getarch.c index f6a5ecb94..ed304b692 100644 --- a/getarch.c +++ b/getarch.c @@ -727,6 +727,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_CORTEXA9 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "CORTEXA9" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DCORTEXA9 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" +#define LIBNAME "cortexa9" +#define CORENAME "CORTEXA9" +#else +#endif + +#ifdef FORCE_CORTEXA15 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "CORTEXA15" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DCORTEXA15 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" +#define LIBNAME "cortexa15" +#define CORENAME "CORTEXA15" +#else +#endif + #ifdef FORCE_ARMV6 #define FORCE #define ARCHITECTURE "ARM" diff --git a/kernel/arm/KERNEL.CORTEXA15 b/kernel/arm/KERNEL.CORTEXA15 new file mode 100644 index 000000000..72e3ba02e --- /dev/null +++ b/kernel/arm/KERNEL.CORTEXA15 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV7 \ No newline at end of file diff --git a/kernel/arm/KERNEL.CORTEXA9 b/kernel/arm/KERNEL.CORTEXA9 new file mode 100644 index 000000000..72e3ba02e --- /dev/null +++ b/kernel/arm/KERNEL.CORTEXA9 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV7 \ No newline at end of file diff --git a/param.h b/param.h index 08c5dc81f..18c711eb3 100644 --- a/param.h +++ b/param.h @@ -2206,6 +2206,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#ifdef CORTEXA9 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + +#ifdef CORTEXA15 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC From 4e6c4046f76c1ad624f0be5a326b3d35c90711ab Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 12 Jan 2015 09:35:16 +0000 Subject: [PATCH 22/37] Fix cortex-a15 detecting bug. 
--- cpuid_arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_arm.c b/cpuid_arm.c index 211ea2764..51ba72d70 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -112,7 +112,7 @@ int detect(void) if (strstr(p, "0xc09")) { return CPU_CORTEXA9; } - if (strstr(p, "0xc15")) { + if (strstr(p, "0xc0f")) { return CPU_CORTEXA15; } From 29cb47fc06ce8e81c39a8ecc19964053033b21ed Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Mon, 12 Jan 2015 21:27:52 -0500 Subject: [PATCH 23/37] Move #include statements outside extern "C" blocks --- lapack-netlib/lapacke/include/lapacke_config.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/lapacke/include/lapacke_config.h b/lapack-netlib/lapacke/include/lapacke_config.h index 561b2736b..d46ed98e5 100644 --- a/lapack-netlib/lapacke/include/lapacke_config.h +++ b/lapack-netlib/lapacke/include/lapacke_config.h @@ -38,7 +38,6 @@ #if defined(LAPACK_COMPLEX_CPP) #include <complex> #endif -extern "C" { #endif /* __cplusplus */ #include <stdlib.h> @@ -63,8 +62,14 @@ extern "C" { #if defined(LAPACK_COMPLEX_STRUCTURE) +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ typedef struct { float real, imag; } _lapack_complex_float; typedef struct { double real, imag; } _lapack_complex_double; +#ifdef __cplusplus +} +#endif /* __cplusplus */ #define lapack_complex_float _lapack_complex_float #define lapack_complex_double _lapack_complex_double #define lapack_complex_float_real(z) ((z).real) @@ -103,8 +108,14 @@ typedef struct { double real, imag; } _lapack_complex_double; #endif +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ lapack_complex_float lapack_make_complex_float( float re, float im ); lapack_complex_double lapack_make_complex_double( double re, double im ); +#ifdef __cplusplus +} +#endif /* __cplusplus */ #endif @@ -116,8 +127,4 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #define LAPACK_free( p ) free( p ) #endif -#ifdef __cplusplus -} 
-#endif /* __cplusplus */ - #endif /* _LAPACKE_CONFIG_H_ */ From b17ccb4c5c5ef3a41b7a5040f1c604348f8cf6f0 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Thu, 29 Jan 2015 09:55:57 +0100 Subject: [PATCH 24/37] Fix a segfault in gemv when MAX_STACK_ALLOC is set * stack_alloc_size is needed after the implementation call but it may be overwritten if it's optimized to a register, because some gemv implementation (ex: dgemv_n.S) do not restore all register (ex: r10). * do the same in ger.c for the same reasons even if the bug has not been observed. --- interface/gemv.c | 4 +++- interface/ger.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 155305be8..f33973ef3 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -209,7 +209,9 @@ void CNAME(enum CBLAS_ORDER order, if (incy < 0) y -= (leny - 1) * incy; #ifdef MAX_STACK_ALLOC - int stack_alloc_size = m + n; + // make it volatile because some gemv implementation (ex: dgemv_n.S) + // do not restore all register + volatile int stack_alloc_size = m + n; if(stack_alloc_size < 128) //dgemv_n.S require a 128 bytes buffer stack_alloc_size = 128; diff --git a/interface/ger.c b/interface/ger.c index cac357786..9dd2dc58b 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -172,7 +172,7 @@ void CNAME(enum CBLAS_ORDER order, if (incx < 0) x -= (m - 1) * incx; #ifdef MAX_STACK_ALLOC - int stack_alloc_size = m; + volatile int stack_alloc_size = m; if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) stack_alloc_size = 0; FLOAT stack_buffer[stack_alloc_size]; From 65a847cd361d33b4a65c10d13eefb11eb02f04d7 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 3 Feb 2015 12:23:34 -0500 Subject: [PATCH 25/37] Introduce openblas_get_num_threads and openblas_get_num_procs --- cblas.h | 6 ++++ cblas_noconst.h | 6 ++++ driver/others/Makefile | 8 ++++- driver/others/memory.c | 8 +++++ driver/others/openblas_get_num_procs.c | 40 ++++++++++++++++++++++++ 
driver/others/openblas_get_num_threads.c | 40 ++++++++++++++++++++++++ exports/gensymbol | 5 ++- 7 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 driver/others/openblas_get_num_procs.c create mode 100644 driver/others/openblas_get_num_threads.c diff --git a/cblas.h b/cblas.h index d1c029afa..a21863d88 100644 --- a/cblas.h +++ b/cblas.h @@ -13,6 +13,12 @@ extern "C" { void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +/*Get the number of threads on runtime.*/ +int openblas_get_num_threads(void); + +/*Get the number of physical processors (cores).*/ +int openblas_get_num_procs(void); + /*Get the build configure on runtime.*/ char* openblas_get_config(void); diff --git a/cblas_noconst.h b/cblas_noconst.h index bc6382513..f6a6baf62 100644 --- a/cblas_noconst.h +++ b/cblas_noconst.h @@ -13,6 +13,12 @@ extern "C" { void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +/*Get the number of threads on runtime.*/ +int openblas_get_num_threads(void); + +/*Get the number of physical processors (cores).*/ +int openblas_get_num_procs(void); + /*Get the build configure on runtime.*/ char* openblas_get_config(void); diff --git a/driver/others/Makefile b/driver/others/Makefile index fc73871cc..ed145cee8 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. 
include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) @@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c + $(CC) $(CFLAGS) -c $< -o $(@F) + openblas_get_config.$(SUFFIX) : openblas_get_config.c $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/memory.c b/driver/others/memory.c index 16d68cced..e245d9e53 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -241,6 +241,10 @@ void set_stack_limit(int limitMB){ */ #endif +int openblas_get_num_procs(void) { + return get_num_procs(); +} + /* OpenBLAS uses the numbers of CPU cores in multithreading. 
It can be set by openblas_set_num_threads(int num_threads); @@ -323,6 +327,10 @@ int blas_get_cpu_number(void){ } #endif +int openblas_get_num_threads(void) { + return blas_get_cpu_number(); +} + struct release_t { void *address; void (*func)(struct release_t *); diff --git a/driver/others/openblas_get_num_procs.c b/driver/others/openblas_get_num_procs.c new file mode 100644 index 000000000..6b0c1ec5c --- /dev/null +++ b/driver/others/openblas_get_num_procs.c @@ -0,0 +1,40 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +extern int openblas_get_num_procs(void); + +int openblas_get_num_procs_(void) { + return openblas_get_num_procs(); +} diff --git a/driver/others/openblas_get_num_threads.c b/driver/others/openblas_get_num_threads.c new file mode 100644 index 000000000..e31aa4b4a --- /dev/null +++ b/driver/others/openblas_get_num_threads.c @@ -0,0 +1,40 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +extern int openblas_get_num_threads(void); + +int openblas_get_num_threads_(void) { + return openblas_get_num_threads(); +} diff --git a/exports/gensymbol b/exports/gensymbol index 8bd2f17af..2155f801f 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -81,7 +81,10 @@ #both underscore and no underscore @misc_common_objs = ( - openblas_set_num_threads, openblas_get_parallel, + openblas_get_parallel, + openblas_get_num_procs, + openblas_set_num_threads, + openblas_get_num_threads, ); @misc_no_underscore_objs = ( From cfa9392ffa1903c309fb2d8c9ad74831475b778d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 8 Feb 2015 01:30:12 -0600 Subject: [PATCH 26/37] Fix openblas_get_num_threads and openblas_get_num_procs bug with single thread. 
--- driver/others/memory.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index e245d9e53..031615576 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -241,9 +241,6 @@ void set_stack_limit(int limitMB){ */ #endif -int openblas_get_num_procs(void) { - return get_num_procs(); -} /* OpenBLAS uses the numbers of CPU cores in multithreading. @@ -327,8 +324,21 @@ int blas_get_cpu_number(void){ } #endif + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else return blas_get_cpu_number(); +#endif } struct release_t { From 771b18ae9c3aa70f4a2497b59700e06819a6ae30 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 8 Feb 2015 01:42:48 -0600 Subject: [PATCH 27/37] Detect the wrong combined flags of USE_OPENMP=1 and USE_THREAD=0. --- Makefile.system | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index 1d1d48c75..525daa41b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -364,6 +364,12 @@ endif ifeq ($(USE_OPENMP), 1) + +#check +ifeq ($(USE_THREAD), 0) +$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) +endif + # ifeq logical or. 
GCC or LSB ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) CCOMMON_OPT += -fopenmp From 39cc6b21d3e282e8b91ffe4255e02e586885db2c Mon Sep 17 00:00:00 2001 From: Martin Koehler Date: Mon, 16 Feb 2015 13:46:20 +0100 Subject: [PATCH 28/37] Add ATLAS-style ?geadd function --- cblas.h | 10 +++ cblas_noconst.h | 10 +++ common_c.h | 2 + common_d.h | 3 + common_interface.h | 6 ++ common_level3.h | 5 ++ common_macro.h | 6 +- common_param.h | 4 ++ common_s.h | 2 + common_z.h | 3 + exports/gensymbol | 4 +- interface/Makefile | 49 ++++++++++++--- interface/geadd.c | 148 ++++++++++++++++++++++++++++++++++++++++++++ interface/zgeadd.c | 146 +++++++++++++++++++++++++++++++++++++++++++ kernel/Makefile.L3 | 40 ++++++++++-- kernel/arm/geadd.c | 64 +++++++++++++++++++ kernel/arm/zgeadd.c | 65 +++++++++++++++++++ 17 files changed, 553 insertions(+), 14 deletions(-) create mode 100644 interface/geadd.c create mode 100644 interface/zgeadd.c create mode 100644 kernel/arm/geadd.c create mode 100644 kernel/arm/zgeadd.c diff --git a/cblas.h b/cblas.h index a21863d88..d6949e10c 100644 --- a/cblas.h +++ b/cblas.h @@ -347,6 +347,16 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, + float *c, OPENBLAS_CONST blasint cldc); +void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, + double *c, OPENBLAS_CONST blasint cldc); +void 
cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, + float *c, OPENBLAS_CONST blasint cldc); +void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, + double *c, OPENBLAS_CONST blasint cldc); + + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/cblas_noconst.h b/cblas_noconst.h index f6a6baf62..4451c304e 100644 --- a/cblas_noconst.h +++ b/cblas_noconst.h @@ -333,6 +333,16 @@ void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, bl blasint clda, blasint cldb); void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a, blasint clda, blasint cldb); + +void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta, + float *c, blasint cldc); +void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta, + double *c, blasint cldc); +void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta, + float *c, blasint cldc); +void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta, + double *c, blasint cldc); + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/common_c.h b/common_c.h index 724d1e261..741d7d087 100644 --- a/common_c.h +++ b/common_c.h @@ -220,6 +220,7 @@ #define COMATCOPY_K_CTC comatcopy_k_ctc #define COMATCOPY_K_RTC comatcopy_k_rtc +#define CGEADD_K cgeadd_k #else @@ -402,6 +403,7 @@ #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> 
comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc +#define CGEADD_K gotoblas -> cgeadd_k #endif diff --git a/common_d.h b/common_d.h index c34e1f28f..d6dfd7f04 100644 --- a/common_d.h +++ b/common_d.h @@ -149,6 +149,7 @@ #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt +#define DGEADD_K dgeadd_k #else @@ -267,6 +268,8 @@ #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt +#define DGEADD_K gotoblas -> dgeadd_k + #endif #define DGEMM_NN dgemm_nn diff --git a/common_interface.h b/common_interface.h index ddd2cf6e5..15f69e02f 100644 --- a/common_interface.h +++ b/common_interface.h @@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *); void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *); +void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); +void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); +void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); +void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); + + #ifdef __cplusplus } diff --git a/common_level3.h b/common_level3.h index 0babd45b7..e0ecbc4e2 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1762,6 +1762,11 @@ int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, dou int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); +int 
dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); +int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); +int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); + #ifdef __CUDACC__ } diff --git a/common_macro.h b/common_macro.h index f9de3773a..8555baa67 100644 --- a/common_macro.h +++ b/common_macro.h @@ -634,7 +634,7 @@ #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT - +#define GEADD_K DGEADD_K #else #define AMAX_K SAMAX_K @@ -932,6 +932,7 @@ #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT +#define GEADD_K SGEADD_K #endif #else #ifdef XDOUBLE @@ -1746,6 +1747,7 @@ #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC +#define GEADD_K ZGEADD_K #else @@ -2159,6 +2161,8 @@ #define OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC +#define GEADD_K CGEADD_K + #endif #endif diff --git a/common_param.h b/common_param.h index 49c1bf73b..1b56e85f0 100644 --- a/common_param.h +++ b/common_param.h @@ -855,6 +855,10 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); + int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); + int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); } gotoblas_t; diff --git a/common_s.h b/common_s.h index 4e9b6dbe7..a4d8679b7 100644 --- a/common_s.h +++ b/common_s.h 
@@ -153,6 +153,7 @@ #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt +#define SGEADD_K sgeadd_k #else @@ -274,6 +275,7 @@ #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt +#define SGEADD_K gotoblas -> sgeadd_k #endif diff --git a/common_z.h b/common_z.h index 133dea80c..85f577a27 100644 --- a/common_z.h +++ b/common_z.h @@ -220,6 +220,7 @@ #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc +#define ZGEADD_K zgeadd_k #else @@ -403,6 +404,8 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc +#define ZGEADD_K gotoblas -> zgeadd_k + #endif #define ZGEMM_NN zgemm_nn diff --git a/exports/gensymbol b/exports/gensymbol index 2155f801f..12ca7376c 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -23,7 +23,8 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv, xerbla, - saxpby,daxpby,caxpby,zaxpby + saxpby,daxpby,caxpby,zaxpby, + sgeadd,dgeadd,cgeadd,zgeadd, ); @cblasobjs = ( @@ -55,6 +56,7 @@ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, + cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd ); @exblasobjs = ( diff --git a/interface/Makefile b/interface/Makefile index 54699b7e3..1666d9145 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -43,7 +43,8 @@ SBLAS2OBJS = \ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ - somatcopy.$(SUFFIX) simatcopy.$(SUFFIX) + somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ + sgeadd.$(SUFFIX) DBLAS1OBJS = \ @@ -68,7 +69,8 @@ DBLAS2OBJS = \ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ - domatcopy.$(SUFFIX) 
dimatcopy.$(SUFFIX)\ + dgeadd.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -96,7 +98,8 @@ CBLAS3OBJS = \ cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ - comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX) + comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ + cgeadd.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -124,7 +127,8 @@ ZBLAS3OBJS = \ zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ - zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX) + zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ + zgeadd.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -269,7 +273,8 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ - cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX) + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ + cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -285,7 +290,8 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ - cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) + cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ + cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -308,7 +314,9 @@ CCBLAS3OBJS = \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ - cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX) + 
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ + cblas_cgeadd.$(SUFFIX) + CZBLAS1OBJS = \ @@ -332,7 +340,9 @@ CZBLAS3OBJS = \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ - cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) + cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ + cblas_zgeadd.$(SUFFIX) + ifeq ($(SUPPORT_GEMM3M), 1) @@ -2103,4 +2113,27 @@ zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) +sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) diff --git a/interface/geadd.c b/interface/geadd.c new file mode 100644 index 000000000..f0befa14a --- /dev/null +++ b/interface/geadd.c @@ -0,0 +1,148 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#if defined(DOUBLE)
+#define ERROR_NAME "DGEADD "
+#else
+#define ERROR_NAME "SGEADD "
+#endif
+
+#ifndef CBLAS
+/* Fortran entry point: every argument is passed by reference. */
+void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+          FLOAT *BETA, FLOAT *c, blasint *LDC)
+{
+
+  blasint m = *M;
+  blasint n = *N;
+  blasint lda = *LDA;
+  blasint ldc = *LDC;
+  FLOAT alpha = *ALPHA;
+  FLOAT beta = *BETA;
+
+  blasint info;
+
+  PRINT_DEBUG_NAME;
+
+  info = 0;
+
+  /* info is the 1-based position of the offending argument; later checks win. */
+  if (lda < MAX(1, m)) info = 5;   /* LDA is argument 5 (was 6, which is BETA) */
+  if (ldc < MAX(1, m)) info = 8;   /* LDC is argument 8 */
+
+  if (n < 0) info = 2;
+  if (m < 0) info = 1;
+
+  if (info != 0){
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    return;
+  }
+
+#else
+void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta,
+            FLOAT *c, blasint ldc)
+{
+/*
+ * CBLAS entry point.  The scalars arrive by value.  For CblasRowMajor
+ * m and n are exchanged before the dimension checks, so GEADD_K is
+ * called with the exchanged dimensions.  An order that is neither
+ * CblasColMajor nor CblasRowMajor leaves info at 0, which the
+ * "info >= 0" test below also reports through xerbla.
+ */
+
+  blasint info, t;
+
+  PRINT_DEBUG_CNAME;
+
+  info = 0;
+
+  if (order == CblasColMajor) {
+
+    info = -1;                      /* -1 means "no error found" */
+
+    if (ldc < MAX(1, m)) info = 8;
+    if (lda < MAX(1, m)) info = 5;
+    if (n < 0) info = 2;
+    if (m < 0) info = 1;
+
+  }
+
+  if (order == CblasRowMajor) {
+    info = -1;
+
+    t = n;                          /* swap m and n */
+    n = m;
+    m = t;
+
+    if (ldc < MAX(1, m)) info = 8;
+    if (lda < MAX(1, m)) info = 5;
+    if (n < 0) info = 2;
+    if (m < 0) info = 1;
+  }
+
+  if (info >= 0) {
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    return;
+  }
+
+#endif
+
+  if ((m==0) || (n==0)) return;     /* empty matrix: nothing to do */
+
+
+  IDEBUG_START;
+
+  FUNCTION_PROFILE_START();
+
+
+  GEADD_K(m,n,alpha, a, lda, beta, c, ldc);
+
+
+  FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);
+
+  IDEBUG_END;
+
+  return;
+
+}
diff --git a/interface/zgeadd.c b/interface/zgeadd.c
new file mode 100644
index 000000000..7124cf230
--- /dev/null
+++ b/interface/zgeadd.c
@@ -0,0 +1,146 @@
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#if defined(DOUBLE)
+#define ERROR_NAME "ZGEADD "
+#else
+#define ERROR_NAME "CGEADD "
+#endif
+
+#ifndef CBLAS
+/* Fortran entry point: every argument is passed by reference. */
+void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+          FLOAT *BETA, FLOAT *c, blasint *LDC)
+{
+
+  blasint m = *M;
+  blasint n = *N;
+  blasint lda = *LDA;
+  blasint ldc = *LDC;
+
+  blasint info;
+
+  PRINT_DEBUG_NAME;
+
+  info = 0;
+
+  /* info is the 1-based position of the offending argument; later checks win. */
+  if (lda < MAX(1, m)) info = 5;   /* LDA is argument 5 (was 6, which is BETA) */
+  if (ldc < MAX(1, m)) info = 8;   /* LDC is argument 8 */
+
+  if (n < 0) info = 2;
+  if (m < 0) info = 1;
+
+  if (info != 0){
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    return;
+  }
+
+#else
+void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA,
+            FLOAT *c, blasint ldc)
+{
+/*
+ * CBLAS entry point.  ALPHA and BETA are pointers; elements [0] and [1]
+ * of each are handed to GEADD_K (presumably the real and imaginary
+ * parts -- confirm against the kernel).  For CblasRowMajor m and n are
+ * exchanged before the dimension checks.  An unrecognised order leaves
+ * info at 0, which the "info >= 0" test below also reports via xerbla.
+ */
+
+  blasint info, t;
+
+  PRINT_DEBUG_CNAME;
+
+  info = 0;
+
+  if (order == CblasColMajor) {
+
+    info = -1;                      /* -1 means "no error found" */
+
+    if (ldc < MAX(1, m)) info = 8;
+    if (lda < MAX(1, m)) info = 5;
+    if (n < 0) info = 2;
+    if (m < 0) info = 1;
+
+  }
+
+  if (order == CblasRowMajor) {
+    info = -1;
+
+    t = n;                          /* swap m and n */
+    n = m;
+    m = t;
+
+    if (ldc < MAX(1, m)) info = 8;
+    if (lda < MAX(1, m)) info = 5;
+    if (n < 0) info = 2;
+    if (m < 0) info = 1;
+  }
+
+  if (info >= 0) {
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    return;
+  }
+
+#endif
+
+  if ((m==0) || (n==0)) return;     /* empty matrix: nothing to do */
+
+
+  IDEBUG_START;
+
+  FUNCTION_PROFILE_START();
+
+
+  GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc);
+
+
+  FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);
+
+  IDEBUG_END;
+
+  return;
+
+}
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 5702b7ac8..a3ccc19a9 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -329,23 +329,27 @@ endif
 ###### BLAS extensions #####
SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) + somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) + domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) + comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) + zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zgeadd_k$(TSUFFIX).$(SUFFIX) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -3440,3 +3444,31 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef SGEADD_K +SGEADD_K = ../arm/geadd.c +endif + +$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DGEADD_K +DGEADD_K = ../arm/geadd.c +endif + +$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef CGEADD_K +CGEADD_K = ../arm/zgeadd.c +endif + 
+$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ + +ifndef ZGEADD_K +ZGEADD_K = ../arm/zgeadd.c +endif + +$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + diff --git a/kernel/arm/geadd.c b/kernel/arm/geadd.c new file mode 100644 index 000000000..062918b8c --- /dev/null +++ b/kernel/arm/geadd.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + for ( i=0; i Date: Thu, 26 Feb 2015 01:47:11 +0800 Subject: [PATCH 29/37] Refs #509. Fixed geadd building bug with DYNAMIC_ARCH=1. 
--- common_z.h | 2 +- kernel/Makefile.L3 | 8 ++++---- kernel/{arm => generic}/geadd.c | 0 kernel/{arm => generic}/zgeadd.c | 0 kernel/setparam-ref.c | 3 ++- 5 files changed, 7 insertions(+), 6 deletions(-) rename kernel/{arm => generic}/geadd.c (100%) rename kernel/{arm => generic}/zgeadd.c (100%) diff --git a/common_z.h b/common_z.h index 85f577a27..b17122776 100644 --- a/common_z.h +++ b/common_z.h @@ -404,7 +404,7 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc -#define ZGEADD_K zgeadd_k +#define ZGEADD_K gotoblas -> zgeadd_k #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index a3ccc19a9..fdbae2daa 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -3445,28 +3445,28 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) ifndef SGEADD_K -SGEADD_K = ../arm/geadd.c +SGEADD_K = ../generic/geadd.c endif $(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef DGEADD_K -DGEADD_K = ../arm/geadd.c +DGEADD_K = ../generic/geadd.c endif $(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef CGEADD_K -CGEADD_K = ../arm/zgeadd.c +CGEADD_K = ../generic/zgeadd.c endif $(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ ifndef ZGEADD_K -ZGEADD_K = ../arm/zgeadd.c +ZGEADD_K = ../generic/zgeadd.c endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) diff --git a/kernel/arm/geadd.c b/kernel/generic/geadd.c similarity index 100% rename from kernel/arm/geadd.c rename to kernel/generic/geadd.c diff --git a/kernel/arm/zgeadd.c b/kernel/generic/zgeadd.c similarity index 100% rename from kernel/arm/zgeadd.c rename to kernel/generic/zgeadd.c diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 1fa7f7984..0eeac2e1f 100644 --- a/kernel/setparam-ref.c +++ 
b/kernel/setparam-ref.c @@ -548,8 +548,9 @@ gotoblas_t TABLE_NAME = { comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, - zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS + zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, + sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS }; From cdefdb21cd9b98b5ad60bd4b216afe37c58e4cd7 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 26 Feb 2015 06:37:03 +0800 Subject: [PATCH 30/37] Refs #492. Fixed c/zsyr bug with negative incx. --- interface/zsyr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index 5d62e8797..5fe29cefa 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO FUNCTION_PROFILE_START(); - if (incx < 0 ) x -= (n - 1) * incx; + if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); From b6438dedea99647fa479f67766ea8a2c788b17f9 Mon Sep 17 00:00:00 2001 From: Ton van den Heuvel Date: Wed, 18 Mar 2015 13:22:43 +0100 Subject: [PATCH 31/37] Fix issue #508 Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). --- CONTRIBUTORS.md | 3 +++ driver/others/memory.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 152ec95aa..b88e3671b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -124,5 +124,8 @@ In chronological order: * Dan Kortschak * [2015-01-07] Added test for drotmg bug #484. +* Ton van den Heuvel + * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). 
+ * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/driver/others/memory.c b/driver/others/memory.c index 031615576..4010ec974 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1353,6 +1353,8 @@ void DESTRUCTOR gotoblas_quit(void) { if (gotoblas_initialized == 0) return; + blas_shutdown(); + #ifdef PROFILE moncontrol (0); #endif @@ -1374,8 +1376,6 @@ void DESTRUCTOR gotoblas_quit(void) { #ifdef PROFILE moncontrol (1); #endif - - blas_shutdown(); } #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) From 75c40bcc4827524d4de5b8458bf197c6c8e66e38 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 19 Mar 2015 11:51:36 -0500 Subject: [PATCH 32/37] Refs #520. Fixed ONLY_CBLAS=1 compiling bug on OSX. --- exports/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index f2f688191..1fdaf2213 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -100,7 +100,12 @@ else $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed $(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def endif +ifeq ($(NOFORTRAN), 2) +#only build cblas without Fortran + $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) +else $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< From e95d64333a6726a9c339eca0e803881f6cfc9926 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 19 Mar 2015 15:57:22 -0500 Subject: [PATCH 33/37] Refs #519. Avoid calling strncpy. 
--- driver/others/dynamic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 87420938f..60b3c72af 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -363,7 +363,7 @@ static gotoblas_t *force_coretype(char *coretype){ int i ; int found = -1; char message[128]; - char mname[20]; + //char mname[20]; for ( i=1 ; i <= 21; i++) { @@ -375,8 +375,8 @@ static gotoblas_t *force_coretype(char *coretype){ } if (found < 0) { - strncpy(mname,coretype,20); - sprintf(message, "Core not found: %s\n",mname); + //strncpy(mname,coretype,20); + snprintf(message, 128, "Core not found: %s\n",coretype); openblas_warning(1, message); return(NULL); } From 770fac92eb5abb50659056116337d0ab25f492b4 Mon Sep 17 00:00:00 2001 From: Maximilien Levesque Date: Fri, 20 Mar 2015 23:25:11 +0100 Subject: [PATCH 34/37] Correct typo /proc/ instead of /pros/ --- GotoBLAS_05LargePage.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GotoBLAS_05LargePage.txt b/GotoBLAS_05LargePage.txt index ec5106fcd..c3e171a88 100644 --- a/GotoBLAS_05LargePage.txt +++ b/GotoBLAS_05LargePage.txt @@ -9,10 +9,10 @@ If you want to allocate 64 large pages, - $shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset - $shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page - $shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number - $shell> echo 3355443200 > /pros/sys/kernel/shmall + $shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset + $shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page + $shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number + $shell> echo 3355443200 > /proc/sys/kernel/shmall Also may add a few lines into /etc/security/limits.conf file. 
From e34911a73d4db160475519f686dd4d9548710adc Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 24 Mar 2015 17:15:33 +0000 Subject: [PATCH 35/37] Fix compiling bug for ARM with setting BINARY. --- c_check | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/c_check b/c_check index c4aa2d420..fbe9c9fab 100644 --- a/c_check +++ b/c_check @@ -81,6 +81,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) { $defined = 1; } +if (($architecture eq "arm") || ($architecture eq "arm64")) { + $defined = 1; +} + if ($architecture eq "alpha") { $defined = 1; $binary = 64; From c674fa32be040f6a3d4d150169fce2c0002dd70f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 24 Mar 2015 12:17:04 -0500 Subject: [PATCH 36/37] Add ARM targets. --- TargetList.txt | 5 +++++ getarch.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/TargetList.txt b/TargetList.txt index c91401f01..1c985080b 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -63,6 +63,11 @@ SPARC SPARCV7 6.ARM CPU: +CORTEXA15 +CORTEXA9 ARMV7 ARMV6 ARMV5 + +7.ARM 64-bit CPU: +ARMV8 diff --git a/getarch.c b/getarch.c index ed304b692..ee5f55fd1 100644 --- a/getarch.c +++ b/getarch.c @@ -752,8 +752,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" -#define LIBNAME "cortexa9" -#define CORENAME "CORTEXA9" +#define LIBNAME "cortexa15" +#define CORENAME "CORTEXA15" #else #endif From a3491e1e88e0f6a8a09188825048a66ef07ad80f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 24 Mar 2015 15:05:59 -0500 Subject: [PATCH 37/37] Update the doc for 0.2.14. 
--- Changelog.txt | 20 ++++++++++++++++++++ Makefile.rule | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index b11321f71..6941a9f96 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,24 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.14 +24-Mar-2015 +common: + * Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.) + * Improve ger and gemv for small matrices by stack allocation. + e.g. make -DMAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.) + * Introduce openblas_get_num_threads and openblas_get_num_procs. + (#497. Thanks, Erik Schnetter.) + * Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.) + * Fix c/zsyr bug with negative incx. (#492.) + * Fix race condition during shutdown causing a crash in + gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.) + +x86/x86-64: + * Support AMD Steamroller. + +ARM: + * Add Cortex-A9 and Cortex-A15 targets. + ==================================================================== Version 0.2.13 3-Dec-2014 diff --git a/Makefile.rule b/Makefile.rule index bea1fe194..1479de660 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.13 +VERSION = 0.2.14 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library