commit
233c6b959f
|
@ -124,7 +124,13 @@ In chronological order:
|
||||||
* Jerome Robert <jeromerobert@gmx.com>
|
* Jerome Robert <jeromerobert@gmx.com>
|
||||||
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
|
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
|
||||||
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
|
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
|
||||||
|
* [2015-12-28] Allow forcing the number of parallel make jobs
|
||||||
|
* [2015-12-28] Fix detection of AMD E2-3200
|
||||||
|
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
|
||||||
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
|
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
|
||||||
|
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
|
||||||
|
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
|
||||||
|
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
|
||||||
|
|
||||||
* Dan Kortschak
|
* Dan Kortschak
|
||||||
* [2015-01-07] Added test for drotmg bug #484.
|
* [2015-01-07] Added test for drotmg bug #484.
|
||||||
|
|
|
@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto
|
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||||
|
smallscaling
|
||||||
|
|
||||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||||
|
@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
||||||
zgemm3m.$(SUFFIX) : gemm3m.c
|
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
# Small-matrix threading benchmark. The libraries are listed after the
# inputs ($^) so the linker can resolve the symbols those inputs reference.
smallscaling: smallscaling.c ../$(LIBNAME)
	$(CC) $(CFLAGS) -fopenmp -o $(@F) $^ -lm -lpthread
|
||||||
|
|
||||||
clean ::
|
clean ::
|
||||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||||
|
|
|
@ -0,0 +1,191 @@
|
||||||
|
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
|
||||||
|
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#ifndef _WIN32
#include <pthread.h>
#endif
#include <cblas.h>
#include <omp.h>
|
||||||
|
// Range of matrix orders to benchmark: the order grows geometrically from
// MIN_SIZE to MAX_SIZE in NB_SIZE steps.
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10

// Number of loops for a 1x1 matrix. Lower it if the test is
// too slow on your computer.
// Kept as an integer constant because it is stored in an int loop counter.
#define NLOOP 20000000
|
||||||
|
|
||||||
|
// Description of one benchmark run, shared by the sequential, OpenMP and
// pthread drivers.
typedef struct BenchParam BenchParam;
struct BenchParam {
    // Order of the (square) matrix used by the benchmark.
    int matrix_size;
    // Loop budget; each driver divides it by matrix_size to get its
    // iteration count.
    int n_loop;
    // Benchmark driver (trmv_bench, gemv_bench, ger_bench), called with a
    // pointer to this structure.
    void (* bench_func)(BenchParam * param);
    // BLAS routine under test. Deliberately left unprototyped: each routine
    // is called with a different argument list.
    void (* blas_func)();
    // Allocates and fills a buffer of `size` elements for the routine.
    void * (* create_matrix)(int size);
};
|
||||||
|
|
||||||
|
// Allocate `size` single-precision reals and fill element i with
// 1e3 * i / size.
// Returns a malloc'ed buffer that the caller must free(). The program is
// terminated if a non-empty buffer cannot be allocated.
void * s_create_matrix(int size) {
    // Size the buffer from the element type actually stored (float).
    float * r = malloc(size * sizeof *r);
    if(r == NULL && size > 0) {
        perror("s_create_matrix: malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||||
|
|
||||||
|
// Allocate `size` single-precision complex values, stored as 2 * size
// floats, and fill float i with 1e3 * i / size.
// Returns a malloc'ed buffer that the caller must free(). The program is
// terminated if a non-empty buffer cannot be allocated.
void * c_create_matrix(int size) {
    // Two floats per element; sized from the element type actually stored.
    float * r = malloc(size * 2 * sizeof *r);
    if(r == NULL && size > 0) {
        perror("c_create_matrix: malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||||
|
|
||||||
|
// Allocate `size` double-precision complex values, stored as 2 * size
// doubles, and fill double i with 1e3 * i / size.
// Returns a malloc'ed buffer that the caller must free(). The program is
// terminated if a non-empty buffer cannot be allocated.
void * z_create_matrix(int size) {
    double * r = malloc(size * 2 * sizeof *r);
    if(r == NULL && size > 0) {
        perror("z_create_matrix: malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||||
|
|
||||||
|
// Allocate `size` double-precision reals and fill element i with
// 1e3 * i / size.
// Returns a malloc'ed buffer that the caller must free(). The program is
// terminated if a non-empty buffer cannot be allocated.
void * d_create_matrix(int size) {
    double * r = malloc(size * sizeof *r);
    if(r == NULL && size > 0) {
        perror("d_create_matrix: malloc");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||||
|
|
||||||
|
// Benchmark driver for a `trmv`-style routine: builds a size x size matrix
// and a vector of length size with param->create_matrix, then calls
// param->blas_func n_loop / size times on them before freeing both buffers.
void trmv_bench(BenchParam * param)
{
    int order = param->matrix_size;
    int unit_stride = 1;
    int repetitions = param->n_loop / order;
    void * matrix = param->create_matrix(order * order);
    void * vector = param->create_matrix(order);
    int iter = 0;

    while(iter < repetitions) {
        param->blas_func("U", "N", "N", &order, matrix, &order, vector, &unit_stride);
        iter++;
    }

    free(matrix);
    free(vector);
}
|
||||||
|
|
||||||
|
// Benchmark driver for a `gemv`-style routine: builds a size x size matrix
// and a vector of length size with param->create_matrix, then calls
// param->blas_func n_loop / size times, using the same vector as input and
// output and the same scalar for both scalar arguments.
void gemv_bench(BenchParam * param)
{
    int size = param->matrix_size;
    int n = param->n_loop / size;
    // The scalar is stored as two doubles (real, imaginary): this driver is
    // also used with zgemv_, which reads a complex scalar through the
    // pointer. Real routines only read the first element.
    double v[2] = {1.01, 0.0};
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for(int i = 0; i < n; i++) {
        param->blas_func("N", &size, &size, v, A, &size, y, &one, v, y, &one);
    }
    free(A);
    free(y);
}
|
||||||
|
|
||||||
|
// Benchmark driver for a `ger`-style routine: builds a size x size matrix
// and a vector of length size with param->create_matrix, then calls
// param->blas_func n_loop / size times, passing the same vector for both
// vector arguments.
void ger_bench(BenchParam * param) {
    int size = param->matrix_size;
    int n = param->n_loop / size;
    // The scalar is stored as two doubles (real, imaginary): this driver is
    // also used with zgerc_, which reads a complex scalar through the
    // pointer. Real routines only read the first element.
    double v[2] = {1.01, 0.0};
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for(int i = 0; i < n; i++) {
        param->blas_func(&size, &size, v, y, &one, y, &one, A, &size);
    }
    free(A);
    free(y);
}
|
||||||
|
|
||||||
|
#ifndef _WIN32
|
||||||
|
void * pthread_func_wrapper(void * param) {
|
||||||
|
((BenchParam *)param)->bench_func(param);
|
||||||
|
pthread_exit(NULL);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define NB_TESTS 5
|
||||||
|
void * TESTS[4 * NB_TESTS] = {
|
||||||
|
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
|
||||||
|
gemv_bench, dgemv_, d_create_matrix, "dgemv",
|
||||||
|
gemv_bench, zgemv_, z_create_matrix, "zgemv",
|
||||||
|
ger_bench, dger_, d_create_matrix, "dger",
|
||||||
|
ger_bench, zgerc_, z_create_matrix, "zgerc",
|
||||||
|
};
|
||||||
|
|
||||||
|
inline static double delta_time(struct timespec tick) {
|
||||||
|
struct timespec tock;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &tock);
|
||||||
|
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the benchmark on `nb_threads` POSIX threads and return the elapsed
// wall-clock time in seconds. Each thread runs the benchmark with the loop
// budget divided by nb_threads. On _WIN32 nothing is run and 0 is returned.
// Exits the program if a thread cannot be created.
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifdef _WIN32
    return 0;
#else
    struct timespec start;
    pthread_t workers[nb_threads];
    // Single copy shared by every worker thread.
    BenchParam per_thread = *param;

    per_thread.n_loop /= nb_threads;
    clock_gettime(CLOCK_MONOTONIC, &start);

    for(int w = 0; w < nb_threads; w++){
        int status = pthread_create(&workers[w], NULL, pthread_func_wrapper, &per_thread);
        if (status != 0){
            printf("ERROR; return code from pthread_create() is %d\n", status);
            exit(-1);
        }
    }

    // Wait for every worker before stopping the clock.
    for(int w = 0; w < nb_threads; w++){
        pthread_join(workers[w], NULL);
    }

    return delta_time(start);
#endif
}
|
||||||
|
|
||||||
|
// Run the benchmark once on the calling thread and return the elapsed
// wall-clock time in seconds.
double seq_bench(BenchParam * param) {
    struct timespec start;

    clock_gettime(CLOCK_MONOTONIC, &start);
    (*param->bench_func)(param);

    return delta_time(start);
}
|
||||||
|
|
||||||
|
// Run the benchmark inside an OpenMP parallel loop with one iteration per
// available thread (omp_get_max_threads()) and return the elapsed
// wall-clock time in seconds. Each iteration runs the benchmark with the
// loop budget divided by the thread count.
double omp_bench(BenchParam * param) {
    struct timespec start;
    int nb_threads = omp_get_max_threads();
    // Single copy shared by every iteration.
    BenchParam per_thread = *param;
    int worker;

    per_thread.n_loop /= nb_threads;
    clock_gettime(CLOCK_MONOTONIC, &start);

    #pragma omp parallel for
    for(worker = 0; worker < nb_threads; worker++){
        param->bench_func(&per_thread);
    }

    return delta_time(start);
}
|
||||||
|
|
||||||
|
int main(int argc, char * argv[]) {
|
||||||
|
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
|
||||||
|
BenchParam param;
|
||||||
|
int test_id;
|
||||||
|
printf ("Running on %d threads\n", omp_get_max_threads());
|
||||||
|
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
|
||||||
|
double size = MIN_SIZE;
|
||||||
|
param.bench_func = TESTS[test_id * 4];
|
||||||
|
param.blas_func = TESTS[test_id * 4 + 1];
|
||||||
|
param.create_matrix = TESTS[test_id * 4 + 2];
|
||||||
|
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
|
||||||
|
param.n_loop = NLOOP;
|
||||||
|
while(size <= MAX_SIZE) {
|
||||||
|
param.matrix_size = (int)(size + 0.5);
|
||||||
|
double seq_time = seq_bench(¶m);
|
||||||
|
double omp_time = omp_bench(¶m);
|
||||||
|
double pthread_time = pthread_bench(¶m, omp_get_max_threads());
|
||||||
|
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
|
||||||
|
"pthread %gs, speedup %g\n",
|
||||||
|
param.matrix_size, seq_time,
|
||||||
|
omp_time, seq_time / omp_time,
|
||||||
|
pthread_time, seq_time / pthread_time);
|
||||||
|
size *= inc_factor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
x = buffer;
|
x = buffer;
|
||||||
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
|
buffer += ((COMPSIZE * args -> m + 3) & ~3);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef TRANS
|
#ifndef TRANS
|
||||||
|
@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||||
|
|
||||||
if (num_cpu) {
|
if (num_cpu) {
|
||||||
queue[0].sa = NULL;
|
queue[0].sa = NULL;
|
||||||
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
|
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
|
||||||
|
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
|
|
|
@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||||
|
|
||||||
if (incb != 1) {
|
if (incb != 1) {
|
||||||
B = buffer;
|
B = buffer;
|
||||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||||
COPY_K(m, b, incb, buffer, 1);
|
COPY_K(m, b, incb, buffer, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
||||||
blasint incy = *INCY;
|
blasint incy = *INCY;
|
||||||
|
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
|
int buffer_size;
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads;
|
int nthreads;
|
||||||
#endif
|
#endif
|
||||||
|
@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
blasint lenx, leny;
|
blasint lenx, leny;
|
||||||
int trans;
|
int trans, buffer_size;
|
||||||
blasint info, t;
|
blasint info, t;
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads;
|
int nthreads;
|
||||||
|
@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
||||||
if (incy < 0) y -= (leny - 1) * incy * 2;
|
if (incy < 0) y -= (leny - 1) * incy * 2;
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
buffer_size += 160 / sizeof(FLOAT) ;
|
||||||
|
#endif
|
||||||
|
// for alignment
|
||||||
|
buffer_size = (buffer_size + 3) & ~3;
|
||||||
|
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||||
|
|
||||||
|
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||||
|
// cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746)
|
||||||
|
if(trans && stack_alloc_size)
|
||||||
|
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
|
@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
STACK_FREE(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||||
|
|
||||||
|
|
|
@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||||
if (incx < 0) x -= (m - 1) * incx * 2;
|
if (incx < 0) x -= (m - 1) * incx * 2;
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
STACK_ALLOC(2 * m, FLOAT, buffer);
|
||||||
|
|
||||||
#ifdef SMPTEST
|
#ifdef SMPTEST
|
||||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||||
|
@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
STACK_FREE(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||||
|
|
||||||
|
|
|
@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
|
||||||
blasint info;
|
blasint info;
|
||||||
int uplo;
|
int uplo;
|
||||||
int unit;
|
int unit;
|
||||||
int trans;
|
int trans, buffer_size;
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads;
|
int nthreads;
|
||||||
|
@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||||
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
|
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
|
||||||
|
|
||||||
int trans, uplo, unit;
|
int trans, uplo, unit, buffer_size;
|
||||||
blasint info;
|
blasint info;
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
#ifdef SMP
|
||||||
|
// Calibrated on a Xeon E5-2630
|
||||||
|
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
|
||||||
|
nthreads = num_cpu_avail(2);
|
||||||
|
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||||
|
nthreads = 2;
|
||||||
|
} else
|
||||||
|
nthreads = 1;
|
||||||
|
|
||||||
|
if(nthreads > 1) {
|
||||||
|
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
|
||||||
|
if(incx != 1)
|
||||||
|
buffer_size += n * 2;
|
||||||
|
}
|
||||||
|
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
nthreads = num_cpu_avail(2);
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
STACK_FREE(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
|
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue