diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0a20564cf..7394e352e 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -43,6 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_sandy-2.c" #endif +#if !defined(DSDOT) +#define RETURN_TYPE FLOAT +#else +#define RETURN_TYPE double +#endif + #ifndef HAVE_KERNEL_8 @@ -71,7 +77,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; @@ -139,4 +145,64 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } +#if defined(SMP) +static int dot_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, RETURN_TYPE *result, BLASLONG dummy3) +{ + *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT dot = 0.0; + +#if defined(SMP) + nthreads = num_cpu_avail(1); + + if (inc_x == 0 || inc_y == 0) + nthreads = 1; + + if (n <= 10000) + nthreads = 1; + + if (nthreads == 1) { + dot = dot_compute(n, x, inc_x, y, inc_y); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + RETURN_TYPE *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif +fprintf(stderr,"threaded ddot with %d threads\n",nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)dot_thread_function, nthreads); + + ptr = (RETURN_TYPE *)result; + for (i = 0; i < nthreads; i++) { + dot = dot + (*ptr); + ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + dot = dot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot; +}