Add multithreading support for Haswell DDOT
copied from ashwinyes' implementation in dot_thunderx2t99.c
This commit is contained in:
parent
0ab5bf1746
commit
85a41e9cdb
|
@ -43,6 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "ddot_microk_sandy-2.c"
|
||||
#endif
|
||||
|
||||
/*
 * RETURN_TYPE is the type used for the per-thread partial results:
 * double when DSDOT is defined, the kernel's FLOAT type otherwise.
 */
#if defined(DSDOT)
#define RETURN_TYPE double
#else
#define RETURN_TYPE FLOAT
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
|
@ -71,7 +77,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
|||
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
@ -139,4 +145,64 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
}
|
||||
|
||||
#if defined(SMP)
|
||||
/*
 * Thread callback handed to blas_level1_thread_with_return_value() by CNAME.
 *
 * Computes the dot product of the n-element vectors x and y (with increments
 * inc_x and inc_y) via dot_compute() and stores it through `result`.
 * dummy0..dummy3 are not used by this function.
 *
 * Always returns 0.
 */
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, RETURN_TYPE *result, BLASLONG dummy3)
{
	RETURN_TYPE partial = dot_compute(n, x, inc_x, y, inc_y);

	*result = partial;

	return 0;
}
|
||||
|
||||
/*
 * Threading helper defined outside this file. CNAME calls it with the two
 * input vectors as a/b (their increments as lda/ldb), a result buffer as c,
 * and dot_thread_function as the callback.
 */
extern int blas_level1_thread_with_return_value(int mode,
	BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
	void *a, BLASLONG lda,
	void *b, BLASLONG ldb,
	void *c, BLASLONG ldc,
	int (*function)(), int nthreads);
|
||||
#endif
|
||||
|
||||
/*
 * Dot-product entry point.
 *
 * Returns the dot product of the n-element vectors x and y, read with
 * increments inc_x and inc_y.
 *
 * Without SMP the work is done by a single dot_compute() call. With SMP the
 * same single call is used when either increment is zero or when n <= 10000;
 * otherwise the work is handed to blas_level1_thread_with_return_value() with
 * dot_thread_function as the callback, and the per-thread partial results are
 * summed here.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
	int nthreads;
	/* Placeholder for the helper's alpha argument; zeroed so the helper is
	 * never handed the address of an indeterminate value. */
	FLOAT dummy_alpha = 0.0;
#endif
	FLOAT dot = 0.0;

#if defined(SMP)
	/* Only query the available thread count when the threaded path can
	 * actually be taken. */
	if (inc_x == 0 || inc_y == 0 || n <= 10000) {
		nthreads = 1;
	} else {
		nthreads = num_cpu_avail(1);
	}

	if (nthreads == 1) {
		dot = dot_compute(n, x, inc_x, y, inc_y);
	} else {
		int mode, i;
		/* Room for MAX_CPU_NUMBER slots of sizeof(double) * 2 bytes each. */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		RETURN_TYPE *ptr;

#if !defined(DOUBLE)
		mode = BLAS_SINGLE | BLAS_REAL;
#else
		mode = BLAS_DOUBLE | BLAS_REAL;
#endif

		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
			x, inc_x, y, inc_y, result, 0,
			(void *)dot_thread_function, nthreads);

		/* Accumulate one partial result per thread, stepping through the
		 * buffer in strides of sizeof(double) * 2 bytes.
		 * NOTE(review): assumes the helper stores each thread's result at
		 * this stride and fills all nthreads slots - confirm against the
		 * helper's implementation. */
		ptr = (RETURN_TYPE *)result;
		for (i = 0; i < nthreads; i++) {
			dot = dot + (*ptr);
			ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
		}
	}
#else
	dot = dot_compute(n, x, inc_x, y, inc_y);
#endif

	return dot;
}
|
||||
|
|
Loading…
Reference in New Issue