diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index b1c8dd140..df92cf4ef 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -124,7 +124,13 @@ In chronological order:
 * Jerome Robert
   * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
   * [2015-12-23] `stack_check` in `gemv.c` (bug #722)
+  * [2015-12-28] Allow forcing the number of parallel make jobs
+  * [2015-12-28] Fix AMD E2-3200 detection
+  * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
   * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
+  * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
+  * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
+  * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)

 * Dan Kortschak
   * [2015-01-07] Added test for drotmg bug #484.
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 492d2617f..bcf3da2cc 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
-	ssymm.goto dsymm.goto csymm.goto zsymm.goto
+	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
+	smallscaling

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
 zgemm3m.$(SUFFIX) : gemm3m.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

+smallscaling: smallscaling.c ../$(LIBNAME)
+	$(CC) $(CFLAGS) -fopenmp -o $(@F) $^ -lpthread -lm

 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib
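The new smallscaling benchmark added below sweeps NB_SIZE matrix sizes spaced geometrically between MIN_SIZE and MAX_SIZE, and divides the loop count by the matrix size so every measurement performs a comparable amount of work. A minimal standalone sketch of that sweep, reusing the constants defined in the file (the printed list is the set of sizes that actually get benchmarked):

#include <math.h>
#include <stdio.h>

// Sketch of the geometric size sweep in smallscaling.c: each size is
// inc_factor times the previous one, giving NB_SIZE steps from
// MIN_SIZE up to MAX_SIZE.
int main(void) {
    const double min_size = 5, max_size = 60;  // MIN_SIZE, MAX_SIZE
    const int nb_size = 10;                    // NB_SIZE
    double inc_factor = exp(log(max_size / min_size) / nb_size);
    for (double size = min_size; size <= max_size; size *= inc_factor)
        printf("%d ", (int)(size + 0.5));      // same rounding as the benchmark
    printf("\n");
    return 0;
}

Geometric spacing keeps the relative resolution constant across the range, which is what matters when hunting for the size at which threading starts to pay off.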
diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c
new file mode 100644
index 000000000..daed8f3da
--- /dev/null
+++ b/benchmark/smallscaling.c
@@ -0,0 +1,191 @@
+// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <pthread.h>
+#include <omp.h>
+#define MIN_SIZE 5
+#define MAX_SIZE 60
+#define NB_SIZE 10
+
+// number of loops for a 1x1 matrix. Lower it if the test is
+// too slow on your computer.
+#define NLOOP 2e7
+
+typedef struct {
+    int matrix_size;
+    int n_loop;
+    void (* bench_func)();
+    void (* blas_func)();
+    void * (* create_matrix)(int size);
+} BenchParam;
+
+void * s_create_matrix(int size) {
+    float * r = malloc(size * sizeof(float));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * c_create_matrix(int size) {
+    float * r = malloc(size * 2 * sizeof(float));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * z_create_matrix(int size) {
+    double * r = malloc(size * 2 * sizeof(double));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * d_create_matrix(int size) {
+    double * r = malloc(size * sizeof(double));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void trmv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("U", "N", "N", &size, A, &size, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void gemv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void ger_bench(BenchParam * param) {
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
+    }
+    free(A);
+    free(y);
+}
+
+#ifndef _WIN32
+void * pthread_func_wrapper(void * param) {
+    ((BenchParam *)param)->bench_func(param);
+    pthread_exit(NULL);
+}
+#endif
+
+#define NB_TESTS 5
+void * TESTS[4 * NB_TESTS] = {
+    trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
+    gemv_bench, dgemv_, d_create_matrix, "dgemv",
+    gemv_bench, zgemv_, z_create_matrix, "zgemv",
+    ger_bench, dger_, d_create_matrix, "dger",
+    ger_bench, zgerc_, z_create_matrix, "zgerc",
+};
+
+inline static double delta_time(struct timespec tick) {
+    struct timespec tock;
+    clock_gettime(CLOCK_MONOTONIC, &tock);
+    return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
+}
+
+double pthread_bench(BenchParam * param, int nb_threads)
+{
+#ifdef _WIN32
+    return 0;
+#else
+    BenchParam threaded_param = *param;
+    pthread_t threads[nb_threads];
+    int t, rc;
+    struct timespec tick;
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    for(t = 0; t < nb_threads; t++) {
+        rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
+        if(rc) {
+            fprintf(stderr, "ERROR: pthread_create() returned %d\n", rc);
+            exit(EXIT_FAILURE);
+        }
+    }
+    for(t = 0; t < nb_threads; t++)
+        pthread_join(threads[t], NULL);
+    return delta_time(tick);
+#endif
+}
+
+double seq_bench(BenchParam * param) {
+    struct timespec tick;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    param->bench_func(param);
+    return delta_time(tick);
+}
+
+double omp_bench(BenchParam * param) {
+    BenchParam threaded_param = *param;
+    struct timespec tick;
+    int t;
+    int nb_threads = omp_get_max_threads();
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    #pragma omp parallel for
+    for(t = 0; t < nb_threads; t++) {
+        param->bench_func(&threaded_param);
+    }
+    return delta_time(tick);
+}
+
+int main(int argc, char * argv[]) {
+    double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
+    BenchParam param;
+    int test_id;
+    printf("Running on %d threads\n", omp_get_max_threads());
+    for(test_id = 0; test_id < NB_TESTS; test_id++) {
+        double size = MIN_SIZE;
+        // TESTS packs four pointers per benchmark: driver function,
+        // BLAS routine, matrix allocator and printable name.
+        param.bench_func = TESTS[test_id * 4];
+        param.blas_func = TESTS[test_id * 4 + 1];
+        param.create_matrix = TESTS[test_id * 4 + 2];
+        printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
+        param.n_loop = NLOOP;
+        while(size <= MAX_SIZE) {
+            param.matrix_size = (int)(size + 0.5);
+            double seq_time = seq_bench(&param);
+            double omp_time = omp_bench(&param);
+            double pthread_time = pthread_bench(&param, omp_get_max_threads());
+            printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
+                   "pthread %gs, speedup %g\n",
+                   param.matrix_size, seq_time,
+                   omp_time, seq_time / omp_time,
+                   pthread_time, seq_time / pthread_time);
+            size *= inc_factor;
+        }
+    }
+}
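A caveat for anyone building smallscaling.c outside this Makefile: the file calls the Fortran-ABI symbols ztrmv_, dgemv_, zgemv_, dger_ and zgerc_ through void(*)() pointers without declaring them, so the compiler falls back on implicit declarations. Hedged prototypes along these lines would silence the warnings; they are written here with plain int, whereas OpenBLAS itself uses blasint, and the z-routine pointers actually reference double-complex pairs:

// Illustrative Fortran-ABI prototypes (every argument passed by reference).
extern void ztrmv_(char *uplo, char *trans, char *diag, int *n,
                   void *a, int *lda, void *x, int *incx);
extern void dgemv_(char *trans, int *m, int *n, double *alpha,
                   double *a, int *lda, double *x, int *incx,
                   double *beta, double *y, int *incy);
extern void zgemv_(char *trans, int *m, int *n, void *alpha,
                   void *a, int *lda, void *x, int *incx,
                   void *beta, void *y, int *incy);
extern void dger_(int *m, int *n, double *alpha, double *x, int *incx,
                  double *y, int *incy, double *a, int *lda);
extern void zgerc_(int *m, int *n, void *alpha, void *x, int *incx,
                   void *y, int *incy, void *a, int *lda);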
diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index a9dc2dc62..42edb83cb 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 #endif

       x = buffer;
-      buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
+      buffer += ((COMPSIZE * args -> m + 3) & ~3);
     }

 #ifndef TRANS
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
   if (num_cpu) {

     queue[0].sa = NULL;
-    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
+    queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;

     queue[num_cpu - 1].next = NULL;
diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c
index f9671c9d6..063de6cbc 100644
--- a/driver/level2/ztrmv_U.c
+++ b/driver/level2/ztrmv_U.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
     COPY_K(m, b, incb, buffer, 1);
   }
diff --git a/interface/zgemv.c b/interface/zgemv.c
index 584080d30..e5ba3757c 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
   blasint incy = *INCY;
   FLOAT *buffer;
+  int buffer_size;
 #ifdef SMP
   int nthreads;
 #endif
@@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
   FLOAT *buffer;
   blasint lenx, leny;
-  int trans;
+  int trans, buffer_size;
   blasint info, t;
 #ifdef SMP
   int nthreads;
 #endif
@@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
   if (incx < 0) x -= (lenx - 1) * incx * 2;
   if (incy < 0) y -= (leny - 1) * incy * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+  buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+  buffer_size += 160 / sizeof(FLOAT);
+#endif
+  // for alignment
+  buffer_size = (buffer_size + 3) & ~3;
+  STACK_ALLOC(buffer_size, FLOAT, buffer);
+
+#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
+  // cgemv_t.S returns NaN if there are NaN or Inf in the buffer (see bug #746)
+  if(trans && stack_alloc_size)
+    memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
+#endif

 #ifdef SMP
@@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
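All three interface files touched by this patch (zgemv.c above, zger.c and ztrmv.c below) swap blas_memory_alloc(1)/blas_memory_free() for the STACK_ALLOC/STACK_FREE macros from common_stackalloc.h. A simplified sketch of the mechanism, assuming the real macros additionally install stack-overrun protection and mark the size volatile; stack_alloc_size deliberately stays visible at the call site, which is what the memset guard in zgemv.c above relies on:

/* Simplified STACK_ALLOC/STACK_FREE sketch: the macro expands to
 * declarations in the caller's scope, so the VLA lives until the
 * routine returns. A zero-length VLA (GCC extension) marks the
 * heap fallback for buffers above MAX_STACK_ALLOC. */
#define STACK_ALLOC(SIZE, TYPE, BUFFER)                         \
  int stack_alloc_size = (SIZE);                                \
  if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE))        \
    stack_alloc_size = 0;          /* too big: use the heap */  \
  TYPE stack_buffer[stack_alloc_size];                          \
  BUFFER = stack_alloc_size ? stack_buffer                      \
                            : (TYPE *)blas_memory_alloc(1);

#define STACK_FREE(BUFFER)                                      \
  if (!stack_alloc_size)                                        \
    blas_memory_free(BUFFER);

Because stack_alloc_size is zero whenever the heap path was taken, STACK_FREE knows whether the buffer must be returned to the pool, and later code can cheaply test which kind of buffer it got.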
diff --git a/interface/zger.c b/interface/zger.c
index f7354d26d..db72b4e4c 100644
--- a/interface/zger.c
+++ b/interface/zger.c
@@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
   if (incy < 0) y -= (n - 1) * incy * 2;
   if (incx < 0) x -= (m - 1) * incx * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+  STACK_ALLOC(2 * m, FLOAT, buffer);

 #ifdef SMPTEST
   // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
@@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
diff --git a/interface/ztrmv.c b/interface/ztrmv.c
index 1abaac920..2be915c32 100644
--- a/interface/ztrmv.c
+++ b/interface/ztrmv.c
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
   blasint info;
   int uplo;
   int unit;
-  int trans;
+  int trans, buffer_size;
   FLOAT *buffer;
 #ifdef SMP
   int nthreads;
 #endif
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
          enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
          blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {

-  int trans, uplo, unit;
+  int trans, uplo, unit, buffer_size;
   blasint info;
   FLOAT *buffer;
 #ifdef SMP
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

   if (incx < 0) x -= (n - 1) * incx * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+#ifdef SMP
+  // Calibrated on a Xeon E5-2630
+  if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
+    nthreads = num_cpu_avail(2);
+    if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
+      nthreads = 2;
+  } else
+    nthreads = 1;
+
+  if(nthreads > 1) {
+    buffer_size = n > 16 ? 0 : n * 4 + 40;
+  }
+  else
+#endif
+  {
+    buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
+    if(incx != 1)
+      buffer_size += n * 2;
+  }
+  STACK_ALLOC(buffer_size, FLOAT, buffer);

 #ifdef SMP
-  nthreads = num_cpu_avail(2);
-
   if (nthreads == 1) {
 #endif
@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
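To make the new ztrmv threading cutover concrete: assuming the default GEMM_MULTITHREAD_THRESHOLD of 4 and sizeof(FLOAT) == 8 (double precision), threading kicks in for n just above 96, and more than two threads are used only beyond n = 128. A quick arithmetic check:

#include <math.h>
#include <stdio.h>

/* Evaluates the two ztrmv thresholds above, assuming the default
 * GEMM_MULTITHREAD_THRESHOLD of 4 and sizeof(FLOAT) == 8. */
int main(void) {
    const double threshold = 4.0, fsize = 8.0;
    printf("multithread for n > %.0f\n",
           sqrt(36.0 * fsize * fsize * threshold));   /* 96 */
    printf("more than 2 threads for n > %.0f\n",
           sqrt(64.0 * fsize * fsize * threshold));   /* 128 */
    return 0;
}

Both bounds scale with GEMM_MULTITHREAD_THRESHOLD, so platforms that raise the threshold to suppress threading overhead automatically push the trmv cutover up as well.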