Fixed a few more unnecessary calls to num_cpu_avail.
I don't have as many benchmarks for these as for gemm, but it should still make a difference for small matrices.
This commit is contained in:
parent
3313e4b946
commit
c2545b0fd6
|
@ -40,11 +40,11 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#ifdef FUNCTION_PROFILE
|
#ifdef FUNCTION_PROFILE
|
||||||
#include "functable.h"
|
#include "functable.h"
|
||||||
#endif
|
#endif
|
||||||
#if defined(Z13)
|
#if defined(Z13)
|
||||||
#define MULTI_THREAD_MINIMAL 200000
|
#define MULTI_THREAD_MINIMAL 200000
|
||||||
#else
|
#else
|
||||||
#define MULTI_THREAD_MINIMAL 10000
|
#define MULTI_THREAD_MINIMAL 10000
|
||||||
#endif
|
#endif
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
|
@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||||
if (incy < 0) y -= (n - 1) * incy;
|
if (incy < 0) y -= (n - 1) * incy;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
if (incx == 0 || incy == 0)
|
//
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
//Temporarily work-around the low performance issue with small input size &
|
//Temporarily work-around the low performance issue with small input size &
|
||||||
//multithreads.
|
//multithreads.
|
||||||
if (n <= MULTI_THREAD_MINIMAL)
|
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
|
||||||
|
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 1048576 )
|
if (n <= 1048576 )
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
if (incx == 0 || incy == 0)
|
//
|
||||||
nthreads = 1;
|
//Temporarily work-around the low performance issue with small input size &
|
||||||
|
|
||||||
//Work around the low performance issue with small input size &
|
|
||||||
//multithreads.
|
//multithreads.
|
||||||
if (n <= MULTI_THREAD_MINIMAL) {
|
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
}
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if ( n <= 1048576 )
|
if ( n <= 1048576 )
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy;
|
||||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
if (incx == 0 || incy == 0)
|
if (incx == 0 || incy == 0)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = casum_compute(n, x, inc_x);
|
asum = casum_compute(n, x, inc_x);
|
||||||
|
|
|
@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
if (n <= 0) return 0;
|
if (n <= 0) return 0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
do_copy(n, x, inc_x, y, inc_y);
|
do_copy(n, x, inc_x, y, inc_y);
|
||||||
|
|
|
@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = dasum_compute(n, x, inc_x);
|
asum = dasum_compute(n, x, inc_x);
|
||||||
|
|
|
@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
" faddp "DOTF", v0.2d \n"
|
" faddp "DOTF", v0.2d \n"
|
||||||
#endif /* !defined(DSDOT) */
|
#endif /* !defined(DSDOT) */
|
||||||
|
|
||||||
#else /* !defined(DOUBLE) */
|
#else /* !defined(DOUBLE) */
|
||||||
#define KERNEL_F1 \
|
#define KERNEL_F1 \
|
||||||
" ldr "TMPX", ["X"] \n" \
|
" ldr "TMPX", ["X"] \n" \
|
||||||
" ldr "TMPY", ["Y"] \n" \
|
" ldr "TMPY", ["Y"] \n" \
|
||||||
|
@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y
|
||||||
RETURN_TYPE dot = 0.0;
|
RETURN_TYPE dot = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||||
|
|
|
@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
||||||
|
|
|
@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
nrm2 = nrm2_compute(n, x, inc_x);
|
nrm2 = nrm2_compute(n, x, inc_x);
|
||||||
|
|
|
@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
BLASLONG max_index = 0;
|
BLASLONG max_index = 0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
max_index = iamax_compute(n, x, inc_x);
|
max_index = iamax_compute(n, x, inc_x);
|
||||||
|
|
|
@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
BLASLONG max_index = 0;
|
BLASLONG max_index = 0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
max_index = izamax_compute(n, x, inc_x);
|
max_index = izamax_compute(n, x, inc_x);
|
||||||
|
|
|
@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = sasum_compute(n, x, inc_x);
|
asum = sasum_compute(n, x, inc_x);
|
||||||
|
|
|
@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
|
||||||
|
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
nrm2_double = nrm2_compute(n, x, inc_x);
|
nrm2_double = nrm2_compute(n, x, inc_x);
|
||||||
|
|
|
@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT asum = 0.0;
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
asum = zasum_compute(n, x, inc_x);
|
asum = zasum_compute(n, x, inc_x);
|
||||||
|
|
|
@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
CIMAG(zdot) = 0.0;
|
CIMAG(zdot) = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
zdot_compute(n, x, inc_x, y, inc_y, &zdot);
|
zdot_compute(n, x, inc_x, y, inc_y, &zdot);
|
||||||
|
|
|
@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER)
|
||||||
#include "ddot_microk_bulldozer-2.c"
|
#include "ddot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
|
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "ddot_microk_steamroller-2.c"
|
#include "ddot_microk_steamroller-2.c"
|
||||||
#elif defined(PILEDRIVER)
|
#elif defined(PILEDRIVER)
|
||||||
#include "ddot_microk_piledriver-2.c"
|
#include "ddot_microk_piledriver-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ddot_microk_nehalem-2.c"
|
#include "ddot_microk_nehalem-2.c"
|
||||||
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX)
|
||||||
#include "ddot_microk_haswell-2.c"
|
#include "ddot_microk_haswell-2.c"
|
||||||
|
@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
|
||||||
FLOAT temp1 = 0.0;
|
FLOAT temp1 = 0.0;
|
||||||
FLOAT temp2 = 0.0;
|
FLOAT temp2 = 0.0;
|
||||||
|
|
||||||
BLASLONG n1 = n & -4;
|
BLASLONG n1 = n & -4;
|
||||||
|
|
||||||
while(i < n1)
|
while(i < n1)
|
||||||
{
|
{
|
||||||
|
@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
FLOAT dot = 0.0;
|
FLOAT dot = 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
nthreads = num_cpu_avail(1);
|
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||||
|
|
||||||
if (inc_x == 0 || inc_y == 0)
|
|
||||||
nthreads = 1;
|
|
||||||
|
|
||||||
if (n <= 10000)
|
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||||
|
|
Loading…
Reference in New Issue