From 32f793195fc20e03d1385040f195ab968b70a877 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Sun, 3 Jan 2016 14:01:12 +0100
Subject: [PATCH 1/6] Use stack allocation in zgemv and zger

For better performance with small matrices.

Ref #727
---
 interface/zgemv.c | 13 ++++++++++---
 interface/zger.c  |  4 ++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/interface/zgemv.c b/interface/zgemv.c
index 584080d30..a335f6832 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
   blasint incy = *INCY;

   FLOAT *buffer;
+  int buffer_size;
 #ifdef SMP
   int nthreads;
 #endif
@@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
   FLOAT *buffer;
   blasint lenx, leny;
-  int trans;
+  int trans, buffer_size;
   blasint info, t;
 #ifdef SMP
   int nthreads;
 #endif
@@ -230,7 +231,13 @@ void CNAME(enum CBLAS_ORDER order,
   if (incx < 0) x -= (lenx - 1) * incx * 2;
   if (incy < 0) y -= (leny - 1) * incy * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+  buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+  buffer_size += 160 / sizeof(FLOAT) ;
+#endif
+  // for alignment
+  buffer_size = (buffer_size + 3) & ~3;
+  STACK_ALLOC(buffer_size, FLOAT, buffer);

 #ifdef SMP

@@ -253,7 +260,7 @@ void CNAME(enum CBLAS_ORDER order,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);

diff --git a/interface/zger.c b/interface/zger.c
index f7354d26d..db72b4e4c 100644
--- a/interface/zger.c
+++ b/interface/zger.c
@@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
   if (incy < 0) y -= (n - 1) * incy * 2;
   if (incx < 0) x -= (m - 1) * incx * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+  STACK_ALLOC(2 * m, FLOAT, buffer);

 #ifdef SMPTEST
   // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
@@ -249,7 +249,7 @@
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);

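A note on the two macros this patch starts relying on: STACK_ALLOC and STACK_FREE (defined in OpenBLAS's common headers) put small work buffers on the stack and presumably fall back to the usual blas_memory_alloc/blas_memory_free path for anything larger than MAX_STACK_ALLOC, recording the choice in a local stack_alloc_size variable (patch 3 below tests that variable directly). The snippet below is only a rough sketch of that idea, with invented EXAMPLE_* names and an arbitrary 2048-byte cut-off, not the real macro definitions:

    /* Rough sketch of the small-buffer-on-the-stack idea, not OpenBLAS code.
     * A request that fits under the cut-off becomes a VLA on the stack; a
     * larger one is heap allocated, and only that case has to be freed. */
    #include <stdio.h>
    #include <stdlib.h>

    #define EXAMPLE_MAX_STACK_ALLOC 2048   /* bytes, arbitrary stand-in for MAX_STACK_ALLOC */

    #define EXAMPLE_STACK_ALLOC(SIZE, TYPE, BUFFER) \
      size_t stack_alloc_size = (SIZE); \
      if (stack_alloc_size * sizeof(TYPE) > EXAMPLE_MAX_STACK_ALLOC) \
        stack_alloc_size = 0; \
      TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1]; \
      (BUFFER) = stack_alloc_size ? stack_buffer : malloc((SIZE) * sizeof(TYPE));

    #define EXAMPLE_STACK_FREE(BUFFER) \
      do { if (!stack_alloc_size) free(BUFFER); } while (0)

    int main(void) {
      double *buffer;
      EXAMPLE_STACK_ALLOC(64, double, buffer);   /* 512 bytes: stays on the stack */
      buffer[0] = 42.0;
      printf("%g\n", buffer[0]);
      EXAMPLE_STACK_FREE(buffer);
      return 0;
    }

The (buffer_size + 3) & ~3 expression in the zgemv hunk above rounds the element count up to a multiple of four; the patch only labels it "for alignment", so read anything more specific than that as an inference.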
From 78dcf5c3d583d933f579e6947fc8cf4b1988840f Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Thu, 14 Jan 2016 22:12:57 +0100
Subject: [PATCH 2/6] Improve performance of ztrmv on small matrices

* Use stack allocation
* Disable multi-threading
* Ref #727
---
 driver/level2/trmv_thread.c |  4 ++--
 driver/level2/ztrmv_U.c     |  2 +-
 interface/ztrmv.c           | 29 +++++++++++++++++++++++------
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index a9dc2dc62..42edb83cb 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 #endif

       x = buffer;
-      buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
+      buffer += ((COMPSIZE * args -> m + 3) & ~3);
     }

 #ifndef TRANS
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
   if (num_cpu) {

     queue[0].sa = NULL;
-    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
+    queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;

     queue[num_cpu - 1].next = NULL;

diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c
index f9671c9d6..063de6cbc 100644
--- a/driver/level2/ztrmv_U.c
+++ b/driver/level2/ztrmv_U.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
     COPY_K(m, b, incb, buffer, 1);
   }

diff --git a/interface/ztrmv.c b/interface/ztrmv.c
index 1abaac920..2be915c32 100644
--- a/interface/ztrmv.c
+++ b/interface/ztrmv.c
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
   blasint info;
   int uplo;
   int unit;
-  int trans;
+  int trans, buffer_size;
   FLOAT *buffer;
 #ifdef SMP
   int nthreads;
 #endif
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
            enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
            blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {

-  int trans, uplo, unit;
+  int trans, uplo, unit, buffer_size;
   blasint info;
   FLOAT *buffer;
 #ifdef SMP
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

   if (incx < 0 ) x -= (n - 1) * incx * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+#ifdef SMP
+  // Calibrated on a Xeon E5-2630
+  if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
+    nthreads = num_cpu_avail(2);
+    if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
+      nthreads = 2;
+  } else
+    nthreads = 1;
+
+  if(nthreads > 1) {
+    buffer_size = n > 16 ? 0 : n * 4 + 40;
+  }
+  else
+#endif
+  {
+    buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
+    if(incx != 1)
+      buffer_size += n * 2;
+  }
+  STACK_ALLOC(buffer_size, FLOAT, buffer);

 #ifdef SMP
-  nthreads = num_cpu_avail(2);
-
   if (nthreads == 1) {
 #endif

@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);


From 5fc2203d8a9951416a6689ed7029e059ce0405d8 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Sun, 24 Jan 2016 10:14:41 +0100
Subject: [PATCH 3/6] zgemv: Add a workaround for #746

---
 interface/zgemv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/interface/zgemv.c b/interface/zgemv.c
index a335f6832..abf2aa710 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -239,6 +239,12 @@ void CNAME(enum CBLAS_ORDER order,
   buffer_size = (buffer_size + 3) & ~3;
   STACK_ALLOC(buffer_size, FLOAT, buffer);

+#ifdef ARCH_X86_64
+  // cgemv_t.S returns NaN if there are NaN or Inf in the buffer (see bug #746)
+  if(trans && stack_alloc_size)
+    memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
+#endif
+
 #ifdef SMP

   if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD )

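For readability, the thread-count heuristic that patch 2 adds to interface/ztrmv.c can be restated as a small standalone function. The 36/64 factors and the use of GEMM_MULTITHREAD_THRESHOLD as an operation-count scale come from the patch; the function name, the sample threshold of 4 and the assumption that sizeof(FLOAT) is 8 (double precision) are illustrative only:

    /* Standalone restatement of the ztrmv thread heuristic, illustration only.
     * threshold stands in for GEMM_MULTITHREAD_THRESHOLD and cpus for
     * num_cpu_avail(2); 8 * 8 plays the role of sizeof(FLOAT) * sizeof(FLOAT). */
    #include <stdio.h>

    static int ztrmv_nthreads(long n, long threshold, int cpus) {
      const long scale = 8L * 8L;
      if (n * n <= 36L * scale * threshold)
        return 1;                 /* small: threading overhead would dominate */
      if (cpus > 2 && n * n < 64L * scale * threshold)
        return 2;                 /* medium: limit the fan-out to two threads */
      return cpus;                /* large: use every available core */
    }

    int main(void) {
      long sizes[] = { 64, 100, 200 };
      int i;
      for (i = 0; i < 3; i++)     /* 4 is a sample threshold, not the library default */
        printf("n = %3ld -> %d thread(s) on an 8-core machine\n",
               sizes[i], ztrmv_nthreads(sizes[i], 4, 8));
      return 0;
    }

With these sample numbers the three sizes land in the three regimes: one thread, two threads, and all eight cores.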
From 73397faf6801779c974e93223f075c918a237ff2 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Sun, 3 Jan 2016 14:04:33 +0100
Subject: [PATCH 4/6] Add benchmark/smallscaling.c

* Bench small matrices with multi-threading
* Close #727
---
 benchmark/Makefile       |   5 +-
 benchmark/smallscaling.c | 191 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/smallscaling.c

diff --git a/benchmark/Makefile b/benchmark/Makefile
index 492d2617f..bcf3da2cc 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
-	ssymm.goto dsymm.goto csymm.goto zsymm.goto
+	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
+	smallscaling

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
 zgemm3m.$(SUFFIX) : gemm3m.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

+smallscaling: smallscaling.c ../$(LIBNAME)
+	$(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^

 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib
diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c
new file mode 100644
index 000000000..daed8f3da
--- /dev/null
+++ b/benchmark/smallscaling.c
@@ -0,0 +1,191 @@
+// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <pthread.h>
+#include <omp.h>
+#define MIN_SIZE 5
+#define MAX_SIZE 60
+#define NB_SIZE 10
+
+// number of loops for a 1x1 matrix. Lower it if the test is
+// too slow on your computer.
+#define NLOOP 2e7
+
+typedef struct {
+    int matrix_size;
+    int n_loop;
+    void (* bench_func)();
+    void (* blas_func)();
+    void * (* create_matrix)(int size);
+} BenchParam;
+
+void * s_create_matrix(int size) {
+    float * r = malloc(size * sizeof(double));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * c_create_matrix(int size) {
+    float * r = malloc(size * 2 * sizeof(double));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * z_create_matrix(int size) {
+    double * r = malloc(size * 2 * sizeof(double));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * d_create_matrix(int size) {
+    double * r = malloc(size * sizeof(double));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void trmv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("U", "N", "N", &size, A, &size, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void gemv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void ger_bench(BenchParam * param) {
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
+    }
+    free(A);
+    free(y);
+}
+
+#ifndef _WIN32
+void * pthread_func_wrapper(void * param) {
+    ((BenchParam *)param)->bench_func(param);
+    pthread_exit(NULL);
+}
+#endif
+
+#define NB_TESTS 5
+void * TESTS[4 * NB_TESTS] = {
+    trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
+    gemv_bench, dgemv_, d_create_matrix, "dgemv",
+    gemv_bench, zgemv_, z_create_matrix, "zgemv",
+    ger_bench, dger_, d_create_matrix, "dger",
+    ger_bench, zgerc_, z_create_matrix, "zgerc",
+};
+
+inline static double delta_time(struct timespec tick) {
+    struct timespec tock;
+    clock_gettime(CLOCK_MONOTONIC, &tock);
+    return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
+}
+
+double pthread_bench(BenchParam * param, int nb_threads)
+{
+#ifdef _WIN32
+    return 0;
+#else
+    BenchParam threaded_param = *param;
+    pthread_t threads[nb_threads];
+    int t, rc;
+    struct timespec tick;
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    for(t = 0; t < nb_threads; t++) {
+        rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
+        if(rc)
+            fprintf(stderr, "pthread_create returned %d\n", rc);
+    }
+    for(t = 0; t < nb_threads; t++)
+        pthread_join(threads[t], NULL);
+    return delta_time(tick);
+#endif
+}
+
+double seq_bench(BenchParam * param) {
+    struct timespec tick;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    param->bench_func(param);
+    return delta_time(tick);
+}
+
+double omp_bench(BenchParam * param) {
+    BenchParam threaded_param = *param;
+    struct timespec tick;
+    int t;
+    int nb_threads = omp_get_max_threads();
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    #pragma omp parallel for
+    for(t = 0; t < nb_threads; t ++){
+        param->bench_func(&threaded_param);
+    }
+    return delta_time(tick);
+}
+
+int main(int argc, char * argv[]) {
+    double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
+    BenchParam param;
+    int test_id;
+    printf("Running on %d threads\n", omp_get_max_threads());
+    for(test_id = 0; test_id < NB_TESTS; test_id ++) {
+        double size = MIN_SIZE;
+        param.bench_func = TESTS[test_id * 4];
+        param.blas_func = TESTS[test_id * 4 + 1];
+        param.create_matrix = TESTS[test_id * 4 + 2];
+        printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
+        param.n_loop = NLOOP;
+        while(size <= MAX_SIZE) {
+            param.matrix_size = (int)(size + 0.5);
+            double seq_time = seq_bench(&param);
+            double omp_time = omp_bench(&param);
+            double pthread_time = pthread_bench(&param, omp_get_max_threads());
+            printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
+                   "pthread %gs, speedup %g\n",
+                   param.matrix_size, seq_time,
+                   omp_time, seq_time / omp_time,
+                   pthread_time, seq_time / pthread_time);
+            size *= inc_factor;
+        }
+    }
+}

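The size ladder that main() in smallscaling.c walks is geometric rather than linear: inc_factor is chosen so that NB_SIZE multiplicative steps span MIN_SIZE to MAX_SIZE. The standalone snippet below (same constants and rounding as the benchmark; link with -lm) just prints the matrix sizes that will be timed:

    /* Prints the geometric size ladder used by smallscaling.c: about ten
     * sizes between 5 and 60, each roughly 28% larger than the previous. */
    #include <math.h>
    #include <stdio.h>

    #define MIN_SIZE 5
    #define MAX_SIZE 60
    #define NB_SIZE 10

    int main(void) {
      double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
      double size = MIN_SIZE;
      while (size <= MAX_SIZE) {
        printf("%d\n", (int)(size + 0.5));   /* same rounding as the benchmark */
        size *= inc_factor;
      }
      return 0;
    }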
From 0ad02ef2d6fb29acd753589b143611c4ff1949f0 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Mon, 18 Jan 2016 18:54:51 +0100
Subject: [PATCH 5/6] update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index b1c8dd140..df92cf4ef 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -124,7 +124,13 @@ In chronological order:
 * Jerome Robert
   * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
   * [2015-12-23] `stack_check` in `gemv.c` (bug #722)
+  * [2015-12-28] Allow forcing the number of parallel make jobs
+  * [2015-12-28] Fix detection of AMD E2-3200
+  * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
   * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
+  * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
+  * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
+  * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)

 * Dan Kortschak
   * [2015-01-07] Added test for drotmg bug #484.

From 16ec5323c9ddf3efe31c85a2a9b4141c452db580 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Mon, 8 Feb 2016 12:05:02 +0100
Subject: [PATCH 6/6] Fix zgemv.c compilation when stack allocation is disabled

---
 interface/zgemv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/zgemv.c b/interface/zgemv.c
index abf2aa710..e5ba3757c 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -239,7 +239,7 @@ void CNAME(enum CBLAS_ORDER order,
   buffer_size = (buffer_size + 3) & ~3;
   STACK_ALLOC(buffer_size, FLOAT, buffer);

-#ifdef ARCH_X86_64
+#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
   // cgemv_t.S returns NaN if there are NaN or Inf in the buffer (see bug #746)
   if(trans && stack_alloc_size)
     memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
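As background for the #746 workaround that patches 3 and 6 touch: the comment added in patch 3 states that cgemv_t.S returns NaN when the work buffer contains NaN or Inf. A plausible mechanism is a 0 * Inf product somewhere in the kernel, which IEEE-754 defines as NaN; the snippet below illustrates that generic failure mode, not the kernel itself:

    /* Why zeroing a freshly (stack-)allocated work buffer matters: stale Inf
     * or NaN contents poison later sums even when multiplied by zero. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double stale = INFINITY;            /* pretend this is leftover buffer content */
      double contribution = 0.0 * stale;  /* IEEE-754: 0 * Inf == NaN, not 0 */
      double acc = 1.0 + contribution;    /* the NaN propagates into the result */
      printf("0 * Inf = %g, 1 + that = %g, isnan = %d\n",
             contribution, acc, isnan(acc));
      return 0;
    }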