diff --git a/interface/axpy.c b/interface/axpy.c index f0d95b395..39edea6af 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -40,11 +40,11 @@ #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" -#endif +#endif #if defined(Z13) #define MULTI_THREAD_MINIMAL 200000 #else -#define MULTI_THREAD_MINIMAL 10000 +#define MULTI_THREAD_MINIMAL 10000 #endif #ifndef CBLAS @@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (incy < 0) y -= (n - 1) * incy; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - + // //Temporarily work-around the low performance issue with small imput size & //multithreads. - if (n <= MULTI_THREAD_MINIMAL) + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/scal.c b/interface/scal.c index 3f468a2a3..6d07b1650 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #ifdef SMP - nthreads = num_cpu_avail(1); - if (n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 529e78e79..1a0259c96 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - - //Work around the low performance issue with small imput size & + // + //Temporarily work-around the low performance issue with small imput size & //multithreads. - if (n <= MULTI_THREAD_MINIMAL) { + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; - } + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zscal.c b/interface/zscal.c index 633b6ecf5..bfaddc260 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ FUNCTION_PROFILE_START(); #ifdef SMP - nthreads = num_cpu_avail(1); - if ( n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/zswap.c b/interface/zswap.c index 5308cbe90..e33bbafba 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c index cd5d936c5..c6dbb3f77 100644 --- a/kernel/arm64/casum_thunderx2t99.c +++ b/kernel/arm64/casum_thunderx2t99.c @@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = casum_compute(n, x, inc_x); diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index bd67b48b0..e31876139 100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if (n <= 0) return 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { do_copy(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index ba12fc776..a212c9534 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = dasum_compute(n, x, inc_x); diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 8eeb94f36..3940acddd 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " faddp "DOTF", v0.2d \n" #endif /* !defined(DSDOT) */ -#else /* !defined(DOUBLE) */ +#else /* !defined(DOUBLE) */ #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ @@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y RETURN_TYPE dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index 2aea9b4a9..b94f0cffc 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_compute(n, x, inc_x, &ssq, &scale); diff --git a/kernel/arm64/dznrm2_thunderx2t99_fast.c b/kernel/arm64/dznrm2_thunderx2t99_fast.c index 8b04a3eb6..8405b388b 100644 --- a/kernel/arm64/dznrm2_thunderx2t99_fast.c +++ b/kernel/arm64/dznrm2_thunderx2t99_fast.c @@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2 = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c index a11b18419..e3bec4a20 100644 --- a/kernel/arm64/iamax_thunderx2t99.c +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = iamax_compute(n, x, inc_x); diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c index 8d70b0515..b2e2828f0 100644 --- a/kernel/arm64/izamax_thunderx2t99.c +++ b/kernel/arm64/izamax_thunderx2t99.c @@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = izamax_compute(n, x, inc_x); diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 28fc34c62..014c667ba 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = sasum_compute(n, x, inc_x); diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c index b8df4962b..f96de441e 100644 --- a/kernel/arm64/scnrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_double = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index 140e5a741..1d303a9a3 100644 --- a/kernel/arm64/zasum_thunderx2t99.c +++ b/kernel/arm64/zasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = zasum_compute(n, x, inc_x); diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 70d683077..6185bc7d9 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA CIMAG(zdot) = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { zdot_compute(n, x, inc_x, y, inc_y, &zdot); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 059549028..0dc9cd3da 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" -#elif defined(NEHALEM) +#elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" @@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; - BLASLONG n1 = n & -4; + BLASLONG n1 = n & -4; while(i < n1) { @@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y);