Add multithreading support to the x86_64 zdot kernel (#2222)

* Add multithreading support

Copied from the ThunderX2T99 kernel. For #2221.
This commit is contained in:
Martin Kroeker 2019-08-15 22:09:12 +02:00 committed by GitHub
parent b48c025974
commit 9ef96b32a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 74 additions and 16 deletions

View File

@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
#endif #endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#if defined(SMP)
/* Threaded level-1 driver, declared locally instead of via a header.
 * CNAME below calls it with the two input vectors as a/b (strides in
 * lda/ldb), a buffer for per-thread partial results as c, and the worker
 * callback as function. */
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result)
{ {
BLASLONG i; BLASLONG i;
BLASLONG ix,iy; BLASLONG ix,iy;
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
if ( n <= 0 ) if ( n <= 0 )
{ {
// CREAL(result) = 0.0 ; OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
// CIMAG(result) = 0.0 ; *result=res;
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return;
return(result);
} }
@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
} }
#if !defined(CONJ) #if !defined(CONJ)
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
// CREAL(result) = dot[0] - dot[1];
// CIMAG(result) = dot[2] + dot[3];
#else #else
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
// CREAL(result) = dot[0] + dot[1];
// CIMAG(result) = dot[2] - dot[3];
#endif #endif
*result=res;
return(result); return;
} }
#if defined(SMP)
/*
 * Worker callback that CNAME hands to blas_level1_thread_with_return_value().
 *
 * Computes the dot product of one slice of x and y and stores it through
 * `result`, which is reinterpreted as a pointer to a single
 * OPENBLAS_COMPLEX_FLOAT.  The dummy parameters are never read; they only
 * pad the signature to the shape the threading driver calls.
 *
 * NOTE(review): only one FLOAT placeholder (dummy2) is declared although
 * CNAME requests a complex mode -- confirm this matches the argument list
 * the dispatcher actually uses on every supported ABI.
 *
 * Always returns 0.
 */
static int zdot_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	/* The driver passes the result slot as a raw FLOAT pointer. */
	OPENBLAS_COMPLEX_FLOAT *partial = (void *)result;

	zdot_compute(n, x, inc_x, y, inc_y, partial);

	return 0;
}
#endif
/*
 * Complex dot product of x and y (conjugated or not, as selected at compile
 * time inside zdot_compute).
 *
 *   n      number of complex elements
 *   x, y   input vectors
 *   inc_x  stride of x
 *   inc_y  stride of y
 *
 * Returns the accumulated dot product; (0, 0) when zdot_compute produces
 * nothing to add.
 *
 * With SMP enabled the work is split across threads only when both strides
 * are non-zero and n > 10000; otherwise, and in non-SMP builds, a single
 * call to zdot_compute does all the work.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
	int nthreads;
	/*
	 * Placeholder for the driver's alpha argument.  It is zero-initialised
	 * so nothing uninitialised is ever read through the pointer, and sized
	 * for a (real, imag) pair because the mode passed below is complex.
	 * NOTE(review): confirm how many alpha elements the dispatcher reads.
	 */
	FLOAT dummy_alpha[2] = { 0.0, 0.0 };
#endif
	OPENBLAS_COMPLEX_FLOAT zdot;

	CREAL(zdot) = 0.0;
	CIMAG(zdot) = 0.0;

#if defined(SMP)
	if (inc_x == 0 || inc_y == 0 || n <= 10000) {
		nthreads = 1;
	} else {
		nthreads = num_cpu_avail(1);
	}

	if (nthreads == 1) {
		zdot_compute(n, x, inc_x, y, inc_y, &zdot);
	} else {
		int mode, i;
		/*
		 * One slot of two doubles (sizeof(double) * 2 bytes) per thread,
		 * the same slot stride the original byte buffer was read with.
		 * Declared as double so the storage is suitably aligned for the
		 * FLOAT stores made by the workers.
		 */
		double result[MAX_CPU_NUMBER * 2];
		OPENBLAS_COMPLEX_FLOAT *ptr;

		/* Clear every slot that will be summed, so a slot no worker
		 * wrote to contributes zero instead of stack garbage. */
		for (i = 0; i < nthreads * 2; i++) {
			result[i] = 0.0;
		}

#if !defined(DOUBLE)
		mode = BLAS_SINGLE | BLAS_COMPLEX;
#else
		mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif

		blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha,
				x, inc_x, y, inc_y, result, 0,
				(void *)zdot_thread_function, nthreads);

		/* Reduce the per-thread partial sums in slot order. */
		for (i = 0; i < nthreads; i++) {
			ptr = (OPENBLAS_COMPLEX_FLOAT *)(result + 2 * i);
			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
		}
	}
#else
	zdot_compute(n, x, inc_x, y, inc_y, &zdot);
#endif

	return zdot;
}