Merge pull request #780 from jeromerobert/bug727

Bug727
2016-02-08 13:24:40 -05:00 · 2016-02-08 13:24:40 -05:00 · 233c6b959f
parent be4e5fcd20 16ec5323c9
commit 233c6b959f
8 changed files with 245 additions and 15 deletions
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -124,7 +124,13 @@ In chronological order:
 * Jerome Robert <jeromerobert@gmx.com>
  * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
  * [2015-12-23] `stack_check` in `gemv.c` (bug #722)
  * [2015-12-28] Allow forcing the number of parallel make jobs
  * [2015-12-28] Fix detection of AMD E2-3200
  * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
  * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
  * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
  * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
  * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
 * Dan Kortschak
  * [2015-01-07] Added test for drotmg bug #484.
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
       sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
       sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
       spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
-       ssymm.goto dsymm.goto csymm.goto zsymm.goto
+       ssymm.goto dsymm.goto csymm.goto zsymm.goto \
       smallscaling
 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
       scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
 zgemm3m.$(SUFFIX) : gemm3m.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
 # Small-matrix scaling benchmark (sequential vs OpenMP vs pthread).
 # The -l options are placed after the source and the library archive:
 # the linker processes its inputs in command-line order, so a library
 # named before the objects that reference it may not be searched for
 # their symbols (static archives, --as-needed).
 smallscaling: smallscaling.c ../$(LIBNAME)
 	$(CC) $(CFLAGS) -fopenmp -o $(@F) $^ -lpthread -lm
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib
--- a/benchmark/smallscaling.c
+++ b/benchmark/smallscaling.c
@ -0,0 +1,191 @@
 // run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
 #include <cblas.h>
 #include <omp.h>
 #ifndef _WIN32
 #include <pthread.h>
 #endif
 // Smallest and largest matrix dimensions exercised by the benchmark.
 #define MIN_SIZE 5
 #define MAX_SIZE 60
 // Number of geometric steps used to grow the size from MIN_SIZE to MAX_SIZE.
 #define NB_SIZE 10
 // Number of loops for a 1x1 matrix. Lower it if the test is
 // too slow on your computer.
 #define NLOOP 2e7
 // Description of one benchmark run, handed to the *_bench drivers.
 typedef struct {
    // Dimension of the square matrix; also the length of the vector.
    int matrix_size;
    // Iteration budget; each driver divides it by matrix_size.
    int n_loop;
    // Benchmark driver, called with a pointer to this structure.
    void (* bench_func)();
    // BLAS routine under test; its argument list depends on the driver,
    // hence the unprototyped declaration.
    void (* blas_func)();
    // Allocates and fills an array of `size` elements for blas_func.
    void * (* create_matrix)(int size);
 } BenchParam;
 // Allocate and fill an array of `size` single-precision reals.
 // Element i is set to 1e3 * i / size, so values lie in [0, 1e3).
 // The caller owns the returned buffer and must release it with free().
 // The process exits with an error message if the allocation fails.
 void * s_create_matrix(int size) {
    // Size the allocation from the element type actually stored (float).
    float * r = malloc(size * sizeof *r);
    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // request is treated as an error.
    if(r == NULL && size > 0) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
 }
 // Allocate and fill an array of 2 * size floats (storage intended for
 // `size` single-precision complex numbers).
 // Float number i (0 <= i < 2 * size) is set to 1e3 * i / size, so values
 // lie in [0, 2e3).
 // The caller owns the returned buffer and must release it with free().
 // The process exits with an error message if the allocation fails.
 void * c_create_matrix(int size) {
    // Size the allocation from the element type actually stored (float).
    float * r = malloc(size * 2 * sizeof *r);
    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // request is treated as an error.
    if(r == NULL && size > 0) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
 }
 // Allocate and fill an array of 2 * size doubles (storage for `size`
 // double-precision complex numbers, as used with the z* routines).
 // Double number i (0 <= i < 2 * size) is set to 1e3 * i / size, so values
 // lie in [0, 2e3).
 // The caller owns the returned buffer and must release it with free().
 // The process exits with an error message if the allocation fails.
 void * z_create_matrix(int size) {
    double * r = malloc(size * 2 * sizeof *r);
    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // request is treated as an error.
    if(r == NULL && size > 0) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
 }
 // Allocate and fill an array of `size` double-precision reals.
 // Element i is set to 1e3 * i / size, so values lie in [0, 1e3).
 // The caller owns the returned buffer and must release it with free().
 // The process exits with an error message if the allocation fails.
 void * d_create_matrix(int size) {
    double * r = malloc(size * sizeof *r);
    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // request is treated as an error.
    if(r == NULL && size > 0) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
 }
 // Benchmark driver for a TRMV-style routine.
 // Builds a matrix_size x matrix_size matrix and a vector of matrix_size
 // elements with param->create_matrix, then calls param->blas_func
 // n_loop / matrix_size times with the arguments
 // ("U", "N", "N", n, A, lda, x, incx) before releasing both buffers.
 void trmv_bench(BenchParam * param)
 {
    int dim = param->matrix_size;
    int unit_stride = 1;
    int iterations = param->n_loop / dim;
    void * matrix = param->create_matrix(dim * dim);
    void * vec = param->create_matrix(dim);
    int k = 0;
    // The vector is updated in place on every call.
    while(k < iterations) {
        param->blas_func("U", "N", "N", &dim, matrix, &dim, vec, &unit_stride);
        k++;
    }
    free(matrix);
    free(vec);
 }
 // Benchmark driver for a GEMV-style routine.
 // Builds a matrix_size x matrix_size matrix and a vector of matrix_size
 // elements with param->create_matrix, then calls param->blas_func
 // n_loop / matrix_size times with the arguments
 // ("N", m, n, alpha, A, lda, x, incx, beta, y, incy) before releasing
 // both buffers.
 void gemv_bench(BenchParam * param)
 {
    int i, n;
    int size = param->matrix_size;
    n = param->n_loop / size;
    // alpha and beta. This driver is also used with zgemv_, which reads a
    // complex scalar, i.e. two doubles (re, im), through the pointer; a
    // single double would be read out of bounds. Real routines such as
    // dgemv_ only read the first element.
    double v[2] = {1.01, 0.0};
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    // NOTE(review): the same buffer is passed as both x and y; this looks
    // deliberate for a timing-only benchmark - confirm.
    for(i = 0; i < n; i++) {
        param->blas_func("N", &size, &size, v, A, &size, y, &one, v, y, &one);
    }
    free(A);
    free(y);
 }
 // Benchmark driver for a GER-style (rank-1 update) routine.
 // Builds a matrix_size x matrix_size matrix and a vector of matrix_size
 // elements with param->create_matrix, then calls param->blas_func
 // n_loop / matrix_size times with the arguments
 // (m, n, alpha, x, incx, y, incy, A, lda) before releasing both buffers.
 // The same vector is passed as both x and y.
 void ger_bench(BenchParam * param) {
    int i, n;
    int size = param->matrix_size;
    n = param->n_loop / size;
    // alpha. This driver is also used with zgerc_, which reads a complex
    // scalar, i.e. two doubles (re, im), through the pointer; a single
    // double would be read out of bounds. Real routines such as dger_
    // only read the first element.
    double v[2] = {1.01, 0.0};
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for(i = 0; i < n; i++) {
        param->blas_func(&size, &size, v, y, &one, y, &one, A, &size);
    }
    free(A);
    free(y);
 }
 #ifndef _WIN32
 void * pthread_func_wrapper(void * param) {
    ((BenchParam *)param)->bench_func(param);
    pthread_exit(NULL);
 }
 #endif
 // Number of benchmarks described in TESTS.
 #define NB_TESTS 5
 // Flat table of benchmarks, four consecutive entries per test:
 //   [4*i + 0] benchmark driver (stored in BenchParam.bench_func)
 //   [4*i + 1] BLAS routine     (stored in BenchParam.blas_func)
 //   [4*i + 2] matrix allocator (stored in BenchParam.create_matrix)
 //   [4*i + 3] name printed in the report (C string)
 void * TESTS[4 * NB_TESTS] = {
    trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
    gemv_bench, dgemv_, d_create_matrix, "dgemv",
    gemv_bench, zgemv_, z_create_matrix, "zgemv",
    ger_bench, dger_, d_create_matrix, "dger",
    ger_bench, zgerc_, z_create_matrix, "zgerc",
 };
 // Return the time in seconds elapsed between `tick` (a previous
 // CLOCK_MONOTONIC sample) and the moment of the call.
 inline static double delta_time(struct timespec tick) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    double whole_seconds = now.tv_sec - tick.tv_sec;
    double nanoseconds = now.tv_nsec - tick.tv_nsec;
    return whole_seconds + nanoseconds / 1e9;
 }
 // Run the benchmark described by `param` on nb_threads POSIX threads and
 // return the wall-clock time in seconds from just before the first thread
 // is created until the last one has been joined.
 // Every thread receives the same copy of *param, whose n_loop has been
 // divided by nb_threads. The process exits if a thread cannot be created.
 // On Windows (_WIN32) nothing is run and 0 is returned.
 double pthread_bench(BenchParam * param, int nb_threads)
 {
 #ifdef _WIN32
    return 0;
 #else
    pthread_t workers[nb_threads];
    struct timespec start;
    BenchParam shared = *param;
    int idx;
    shared.n_loop /= nb_threads;
    clock_gettime(CLOCK_MONOTONIC, &start);
    for(idx = 0; idx < nb_threads; idx++) {
        int status = pthread_create(&workers[idx], NULL, pthread_func_wrapper, &shared);
        if(status != 0) {
            printf("ERROR; return code from pthread_create() is %d\n", status);
            exit(-1);
        }
    }
    for(idx = 0; idx < nb_threads; idx++)
        pthread_join(workers[idx], NULL);
    return delta_time(start);
 #endif
 }
 // Run the benchmark described by `param` once in the calling thread and
 // return the wall-clock time it took, in seconds.
 double seq_bench(BenchParam * param) {
    struct timespec start;
    clock_gettime(CLOCK_MONOTONIC, &start);
    (*param->bench_func)(param);
    double elapsed = delta_time(start);
    return elapsed;
 }
 // Run the benchmark described by `param` inside an OpenMP parallel loop
 // with one iteration per available thread (omp_get_max_threads()), and
 // return the wall-clock time of the whole loop in seconds.
 // Every iteration receives the same copy of *param, whose n_loop has been
 // divided by the number of threads.
 double omp_bench(BenchParam * param) {
    int team_size = omp_get_max_threads();
    BenchParam per_thread = *param;
    struct timespec start;
    int worker;
    per_thread.n_loop = per_thread.n_loop / team_size;
    clock_gettime(CLOCK_MONOTONIC, &start);
    #pragma omp parallel for
    for(worker = 0; worker < team_size; worker++) {
        param->bench_func(&per_thread);
    }
    return delta_time(start);
 }
 // Entry point: for every benchmark in TESTS, time it sequentially, with
 // OpenMP and with pthreads for matrix sizes growing geometrically from
 // MIN_SIZE up to MAX_SIZE, and print one report line per size.
 // Command-line arguments are not used.
 int main(int argc, char * argv[]) {
    // Multiplicative step between two consecutive matrix sizes.
    const double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
    printf ("Running on %d threads\n", omp_get_max_threads());
    for(int test_id = 0; test_id < NB_TESTS; test_id++) {
        // Each test occupies four consecutive slots of TESTS.
        void ** entry = &TESTS[test_id * 4];
        BenchParam param;
        param.bench_func = entry[0];
        param.blas_func = entry[1];
        param.create_matrix = entry[2];
        param.n_loop = NLOOP;
        printf("\nBenchmark of %s\n", (char*)entry[3]);
        for(double size = MIN_SIZE; size <= MAX_SIZE; size *= inc_factor) {
            // Round the real-valued size to the nearest integer dimension.
            param.matrix_size = (int)(size + 0.5);
            double seq_time = seq_bench(&param);
            double omp_time = omp_bench(&param);
            double pthread_time = pthread_bench(&param, omp_get_max_threads());
            printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
                   "pthread %gs, speedup %g\n",
                   param.matrix_size, seq_time,
                   omp_time, seq_time / omp_time,
                   pthread_time, seq_time / pthread_time);
        }
    }
    return 0;
 }
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 #endif
    x = buffer;
-    buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
+    buffer += ((COMPSIZE * args -> m + 3) & ~3);
  }
 #ifndef TRANS
@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
  if (num_cpu) {
    queue[0].sa = NULL;
-    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
+    queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
    queue[num_cpu - 1].next = NULL;
--- a/driver/level2/ztrmv_U.c
+++ b/driver/level2/ztrmv_U.c
@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
  if (incb != 1) {
    B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
    COPY_K(m, b, incb, buffer, 1);
  }
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
  blasint incy = *INCY;
  FLOAT *buffer;
  int buffer_size;
 #ifdef SMP
  int nthreads;
 #endif
@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
  FLOAT *buffer;
  blasint    lenx, leny;
-  int trans;
+  int trans, buffer_size;
  blasint info, t;
 #ifdef SMP
  int nthreads;
@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
  if (incx < 0) x -= (lenx - 1) * incx * 2;
  if (incy < 0) y -= (leny - 1) * incy * 2;
-  buffer = (FLOAT *)blas_memory_alloc(1);
+  buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
  buffer_size += 160 / sizeof(FLOAT) ;
 #endif
  // for alignment
  buffer_size = (buffer_size + 3) & ~3;
  STACK_ALLOC(buffer_size, FLOAT, buffer);
 #if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
  // cgemv_t.S returns NaN if there are NaN or Inf in the buffer (see bug #746)
  if(trans && stack_alloc_size)
    memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
 #endif
 #ifdef SMP
@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
  }
 #endif
-  blas_memory_free(buffer);
+  STACK_FREE(buffer);
  FUNCTION_PROFILE_END(4, m * n + m + n,  2 * m * n);
--- a/interface/zger.c
+++ b/interface/zger.c
@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
  if (incy < 0) y -= (n - 1) * incy * 2;
  if (incx < 0) x -= (m - 1) * incx * 2;
-  buffer = (FLOAT *)blas_memory_alloc(1);
+  STACK_ALLOC(2 * m, FLOAT, buffer);
 #ifdef SMPTEST
  // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
  }
 #endif
-  blas_memory_free(buffer);
+  STACK_FREE(buffer);
  FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
--- a/interface/ztrmv.c
+++ b/interface/ztrmv.c
@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
  blasint info;
  int uplo;
  int unit;
-  int trans;
+  int trans, buffer_size;
  FLOAT *buffer;
 #ifdef SMP
  int nthreads;
@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 	   blasint n, FLOAT  *a, blasint lda, FLOAT  *x, blasint incx) {
-  int trans, uplo, unit;
+  int trans, uplo, unit, buffer_size;
  blasint info;
  FLOAT *buffer;
 #ifdef SMP
@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
  if (incx < 0 ) x -= (n - 1) * incx * 2;
-  buffer = (FLOAT *)blas_memory_alloc(1);
+#ifdef SMP
  // Calibrated on a Xeon E5-2630
  if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
    nthreads = num_cpu_avail(2);
    if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
      nthreads = 2;
  } else
      nthreads = 1;
  if(nthreads > 1) {
    buffer_size = n > 16 ? 0 : n * 4 + 40;
  }
  else
 #endif
  {
    buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
    if(incx != 1)
      buffer_size += n * 2;
  }
  STACK_ALLOC(buffer_size, FLOAT, buffer);
 #ifdef SMP
  nthreads = num_cpu_avail(2);
  if (nthreads == 1) {
 #endif
@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
  }
 #endif
-  blas_memory_free(buffer);
+  STACK_FREE(buffer);
  FUNCTION_PROFILE_END(4, n * n / 2 + n,  n * n);