Merge pull request #780 from jeromerobert/bug727

Bug727
Zhang Xianyi 2016-02-08 13:24:40 -05:00
commit 233c6b959f
8 changed files with 245 additions and 15 deletions


@@ -124,7 +124,13 @@ In chronological order:
* Jerome Robert <jeromerobert@gmx.com>
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
* [2015-12-28] Allow forcing the number of parallel make jobs
* [2015-12-28] Fix detection of the AMD E2-3200
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
* Dan Kortschak
* [2015-01-07] Added test for drotmg bug #484.


@@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
-ssymm.goto dsymm.goto csymm.goto zsymm.goto
+ssymm.goto dsymm.goto csymm.goto zsymm.goto \
+smallscaling
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+smallscaling: smallscaling.c ../$(LIBNAME)
+$(CC) $(CFLAGS) -fopenmp -o $(@F) $^ -lpthread -lm
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib

benchmark/smallscaling.c (new file)

@@ -0,0 +1,191 @@
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <omp.h>
#ifndef _WIN32
#include <pthread.h> /* POSIX threads for pthread_bench below */
#endif
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10
// Number of loops for a 1x1 matrix. Lower it if the test is
// too slow on your computer.
#define NLOOP 2e7
typedef struct {
    int matrix_size;
    int n_loop;
    void (* bench_func)();              // benchmark wrapper (trmv/gemv/ger)
    void (* blas_func)();               // Fortran BLAS routine under test
    void * (* create_matrix)(int size); // matrix/vector allocator
} BenchParam;
void * s_create_matrix(int size) {
    float * r = malloc(size * sizeof(float));
    for(int i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}

void * c_create_matrix(int size) {
    float * r = malloc(size * 2 * sizeof(float));
    for(int i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}

void * z_create_matrix(int size) {
    double * r = malloc(size * 2 * sizeof(double));
    for(int i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}

void * d_create_matrix(int size) {
    double * r = malloc(size * sizeof(double));
    for(int i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
void trmv_bench(BenchParam * param)
{
    int i, n;
    int size = param->matrix_size;
    n = param->n_loop / size;
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for(i = 0; i < n; i++) {
        param->blas_func("U", "N", "N", &size, A, &size, y, &one);
    }
    free(A);
    free(y);
}
void gemv_bench(BenchParam * param)
{
    int i, n;
    int size = param->matrix_size;
    n = param->n_loop / size;
    double v[2] = {1.01, 0}; // alpha/beta; complex routines read both values
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for(i = 0; i < n; i++) {
        param->blas_func("N", &size, &size, v, A, &size, y, &one, v, y, &one);
    }
    free(A);
    free(y);
}

void ger_bench(BenchParam * param) {
    int i, n;
    int size = param->matrix_size;
    n = param->n_loop / size;
    double v[2] = {1.01, 0}; // alpha; complex routines read both values
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for(i = 0; i < n; i++) {
        param->blas_func(&size, &size, v, y, &one, y, &one, A, &size);
    }
    free(A);
    free(y);
}
#ifndef _WIN32
void * pthread_func_wrapper(void * param) {
    ((BenchParam *)param)->bench_func(param);
    pthread_exit(NULL);
}
#endif
#define NB_TESTS 5
// Fortran BLAS symbols used below; cblas.h does not declare them
void ztrmv_(), dgemv_(), zgemv_(), dger_(), zgerc_();

void * TESTS[4 * NB_TESTS] = {
    trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
    gemv_bench, dgemv_, d_create_matrix, "dgemv",
    gemv_bench, zgemv_, z_create_matrix, "zgemv",
    ger_bench, dger_, d_create_matrix, "dger",
    ger_bench, zgerc_, z_create_matrix, "zgerc",
};
static inline double delta_time(struct timespec tick) {
    struct timespec tock;
    clock_gettime(CLOCK_MONOTONIC, &tock);
    return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
}
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifdef _WIN32
    return 0;
#else
    BenchParam threaded_param = *param;
    pthread_t threads[nb_threads];
    int t, rc;
    struct timespec tick;
    threaded_param.n_loop /= nb_threads;
    clock_gettime(CLOCK_MONOTONIC, &tick);
    for(t = 0; t < nb_threads; t++) {
        rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
        if (rc) {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }
    for(t = 0; t < nb_threads; t++) {
        pthread_join(threads[t], NULL);
    }
    return delta_time(tick);
#endif
}
double seq_bench(BenchParam * param) {
    struct timespec tick;
    clock_gettime(CLOCK_MONOTONIC, &tick);
    param->bench_func(param);
    return delta_time(tick);
}

double omp_bench(BenchParam * param) {
    BenchParam threaded_param = *param;
    struct timespec tick;
    int t;
    int nb_threads = omp_get_max_threads();
    threaded_param.n_loop /= nb_threads;
    clock_gettime(CLOCK_MONOTONIC, &tick);
    #pragma omp parallel for
    for(t = 0; t < nb_threads; t++) {
        param->bench_func(&threaded_param);
    }
    return delta_time(tick);
}
int main(int argc, char * argv[]) {
    double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
    BenchParam param;
    int test_id;
    printf("Running on %d threads\n", omp_get_max_threads());
    for(test_id = 0; test_id < NB_TESTS; test_id++) {
        double size = MIN_SIZE;
        param.bench_func = TESTS[test_id * 4];
        param.blas_func = TESTS[test_id * 4 + 1];
        param.create_matrix = TESTS[test_id * 4 + 2];
        printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
        param.n_loop = NLOOP;
        while(size <= MAX_SIZE) {
            param.matrix_size = (int)(size + 0.5);
            double seq_time = seq_bench(&param);
            double omp_time = omp_bench(&param);
            double pthread_time = pthread_bench(&param, omp_get_max_threads());
            printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
                   "pthread %gs, speedup %g\n",
                   param.matrix_size, seq_time,
                   omp_time, seq_time / omp_time,
                   pthread_time, seq_time / pthread_time);
            size *= inc_factor;
        }
    }
}
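As wired up in the Makefile hunk above, `make smallscaling` in `benchmark/` builds the benchmark against the freshly built library, linking both pthreads and OpenMP so the three timing modes are comparable. Per the comment at the top of the file, pin the library itself to a single thread and let the harness do the threading, e.g. `OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=4 ./smallscaling`.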


@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
x = buffer;
-buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
+buffer += ((COMPSIZE * args -> m + 3) & ~3);
}
#ifndef TRANS
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
if (num_cpu) {
queue[0].sa = NULL;
-queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
+queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
queue[num_cpu - 1].next = NULL;
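These two hunks shrink the rounding of the per-thread buffer partitions, from multiples of 1024 and 256 entries down to multiples of 4, so the sizes requested for small matrices stay within the stack-allocation limit this series relies on. The underlying idiom, rounding up to a power-of-two multiple, is worth spelling out; a minimal standalone sketch (the helper name is mine, not from the source):

```c
#include <assert.h>

/* Round x up to the next multiple of align (a power of two): adding
 * align - 1 and then clearing the low bits rounds any non-multiple up. */
static unsigned long round_up(unsigned long x, unsigned long align) {
    return (x + align - 1) & ~(align - 1);
}

int main(void) {
    assert(round_up(100, 1024) == 1024); /* old rounding: 100 entries reserve 1024 */
    assert(round_up(100, 4) == 100);     /* new rounding: exact fit when aligned */
    assert(round_up(101, 4) == 104);
    return 0;
}
```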


@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
-gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}


@@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
blasint incy = *INCY;
FLOAT *buffer;
+int buffer_size;
#ifdef SMP
int nthreads;
#endif
@@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint lenx, leny;
-int trans;
+int trans, buffer_size;
blasint info, t;
#ifdef SMP
int nthreads;
@@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0) x -= (lenx - 1) * incx * 2;
if (incy < 0) y -= (leny - 1) * incy * 2;
-buffer = (FLOAT *)blas_memory_alloc(1);
+buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+buffer_size += 160 / sizeof(FLOAT);
+#endif
+// for alignment
+buffer_size = (buffer_size + 3) & ~3;
+STACK_ALLOC(buffer_size, FLOAT, buffer);
+#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
+// cgemv_t.S returns NaN if there are NaN or Inf in the buffer (see bug #746)
+if(trans && stack_alloc_size)
+memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
+#endif
#ifdef SMP
@@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
-blas_memory_free(buffer);
+STACK_FREE(buffer);
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
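Here and in the files below, `STACK_ALLOC`/`STACK_FREE` replace the `blas_memory_alloc(1)`/`blas_memory_free(buffer)` pair: requests that fit in `MAX_STACK_ALLOC` bytes are served from the caller's stack frame, which is the speed-up this PR is after, while larger requests (and a request of 0) fall back to the old heap buffer. The size requested here, 2(m + n) FLOATs plus alignment slack, covers unit-stride copies of the complex x and y vectors. The real macros live in `common_stackalloc.h`; the sketch below is a simplified assumption of their shape, with a plain `malloc` of a fixed block standing in for `blas_memory_alloc(1)`:

```c
#include <stdlib.h>

#define MAX_STACK_ALLOC 2048        /* bytes; configurable at build time */
#define FULL_BUFFER_BYTES (1 << 20) /* stand-in for the library's fixed BUFFER_SIZE */

/* Sketch only: the real macros also align the buffer and guard against
 * overruns. stack_alloc_size stays visible in the caller so that STACK_FREE
 * (and the memset guard in the hunk above) can test which path was taken. */
#define STACK_ALLOC(size, type, buffer)                              \
    int stack_alloc_size = (size);                                   \
    if ((size_t)stack_alloc_size * sizeof(type) > MAX_STACK_ALLOC)   \
        stack_alloc_size = 0;                                        \
    type stack_buffer[stack_alloc_size ? stack_alloc_size : 1];      \
    (buffer) = stack_alloc_size ? stack_buffer                       \
                                : (type *)malloc(FULL_BUFFER_BYTES);

#define STACK_FREE(buffer) \
    if (!stack_alloc_size) free(buffer);
```

With this shape, the `STACK_FREE(buffer)` that replaces `blas_memory_free(buffer)` is a no-op whenever the buffer came from the stack.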


@@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
if (incy < 0) y -= (n - 1) * incy * 2;
if (incx < 0) x -= (m - 1) * incx * 2;
-buffer = (FLOAT *)blas_memory_alloc(1);
+STACK_ALLOC(2 * m, FLOAT, buffer);
#ifdef SMPTEST
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
@@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
-blas_memory_free(buffer);
+STACK_FREE(buffer);
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
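The fixed request of `2 * m` FLOATs appears to be all `zger` needs: the workspace is only used for a unit-stride copy of x (m complex elements, hence 2m FLOATs) when `incx != 1`, so small rank-1 updates now run entirely out of the stack buffer.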


@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint info;
int uplo;
int unit;
-int trans;
+int trans, buffer_size;
FLOAT *buffer;
#ifdef SMP
int nthreads;
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
-int trans, uplo, unit;
+int trans, uplo, unit, buffer_size;
blasint info;
FLOAT *buffer;
#ifdef SMP
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (incx < 0 ) x -= (n - 1) * incx * 2;
-buffer = (FLOAT *)blas_memory_alloc(1);
+#ifdef SMP
+// Calibrated on a Xeon E5-2630
+if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
+nthreads = num_cpu_avail(2);
+if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
+nthreads = 2;
+} else
+nthreads = 1;
+if(nthreads > 1) {
+buffer_size = n > 16 ? 0 : n * 4 + 40;
+}
+else
+#endif
+{
+buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
+if(incx != 1)
+buffer_size += n * 2;
+}
+STACK_ALLOC(buffer_size, FLOAT, buffer);
#ifdef SMP
-nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
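For scale, with the default `GEMM_MULTITHREAD_THRESHOLD` of 4 and `sizeof(FLOAT) == 8` (double precision), the first test reads n² > 36·8·8·4 = 9216, so threads are considered only for n > 96, and the second caps it at two threads until n² ≥ 64·8·8·4 = 16384, i.e. n ≈ 128. A `buffer_size` of 0 on the threaded path is deliberate: it forces `STACK_ALLOC` onto the heap fallback, whose fixed-size buffer is what the threaded kernel partitions.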
@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif
-blas_memory_free(buffer);
+STACK_FREE(buffer);
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);