From 73397faf6801779c974e93223f075c918a237ff2 Mon Sep 17 00:00:00 2001
From: Jerome Robert
Date: Sun, 3 Jan 2016 14:04:33 +0100
Subject: [PATCH] Add benchmark/smallscaling.c

* Benchmark small matrices with multi-threading
* Close #727
---
 benchmark/Makefile       |   5 +-
 benchmark/smallscaling.c | 191 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/smallscaling.c

diff --git a/benchmark/Makefile b/benchmark/Makefile
index 492d2617f..bcf3da2cc 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
-	ssymm.goto dsymm.goto csymm.goto zsymm.goto
+	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
+	smallscaling
 
 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
 
 zgemm3m.$(SUFFIX) : gemm3m.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
 
+smallscaling: smallscaling.c ../$(LIBNAME)
+	$(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^
 
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib
diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c
new file mode 100644
index 000000000..daed8f3da
--- /dev/null
+++ b/benchmark/smallscaling.c
@@ -0,0 +1,191 @@
+// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <pthread.h>
+#include <omp.h>
+
+// BLAS routines used below (Fortran calling convention), resolved
+// against libopenblas at link time.
+extern void ztrmv_(), dgemv_(), zgemv_(), dger_(), zgerc_();
+
+#define MIN_SIZE 5
+#define MAX_SIZE 60
+#define NB_SIZE 10
+
+// number of loops for a 1x1 matrix. Lower it if the test is
+// too slow on your computer.
+#define NLOOP 2e7
+
+typedef struct {
+    int matrix_size;
+    int n_loop;
+    void (* bench_func)();
+    void (* blas_func)();
+    void * (* create_matrix)(int size);
+} BenchParam;
+
+void * s_create_matrix(int size) {
+    float * r = malloc(size * sizeof(double));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * c_create_matrix(int size) {
+    float * r = malloc(size * 2 * sizeof(double));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * z_create_matrix(int size) {
+    double * r = malloc(size * 2 * sizeof(double));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * d_create_matrix(int size) {
+    double * r = malloc(size * sizeof(double));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void trmv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("U", "N", "N", &size, A, &size, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void gemv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void ger_bench(BenchParam * param) {
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
+    }
+    free(A);
+    free(y);
+}
+
+#ifndef _WIN32
+void * pthread_func_wrapper(void * param) {
+    ((BenchParam *)param)->bench_func(param);
+    pthread_exit(NULL);
+}
+#endif
+
+#define NB_TESTS 5
+// Each row: benchmark driver, BLAS routine, matrix constructor, display name
+void * TESTS[4 * NB_TESTS] = {
+    trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
+    gemv_bench, dgemv_, d_create_matrix, "dgemv",
+    gemv_bench, zgemv_, z_create_matrix, "zgemv",
+    ger_bench, dger_, d_create_matrix, "dger",
+    ger_bench, zgerc_, z_create_matrix, "zgerc",
+};
+
+inline static double delta_time(struct timespec tick) {
+    struct timespec tock;
+    clock_gettime(CLOCK_MONOTONIC, &tock);
+    return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
+}
+
+double pthread_bench(BenchParam * param, int nb_threads)
+{
+#ifdef _WIN32
+    return 0;
+#else
+    BenchParam threaded_param = *param;
+    pthread_t threads[nb_threads];
+    int t, rc;
+    struct timespec tick;
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    for(t = 0; t < nb_threads; t++) {
+        rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, (void *)&threaded_param);
+        if(rc) {
+            printf("ERROR: pthread_create() returned %d\n", rc);
+            exit(-1);
+        }
+    }
+    for(t = 0; t < nb_threads; t++)
+        pthread_join(threads[t], NULL);
+    return delta_time(tick);
+#endif
+}
+
+double seq_bench(BenchParam * param) {
+    struct timespec tick;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    param->bench_func(param);
+    return delta_time(tick);
+}
+
+double omp_bench(BenchParam * param) {
+    BenchParam threaded_param = *param;
+    struct timespec tick;
+    int t;
+    int nb_threads = omp_get_max_threads();
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    #pragma omp parallel for
+    for(t = 0; t < nb_threads; t++) {
+        param->bench_func(&threaded_param);
+    }
+    return delta_time(tick);
+}
+
+int main(int argc, char * argv[]) {
+    double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
+    BenchParam param;
+    int test_id;
+    printf("Running on %d threads\n", omp_get_max_threads());
+    for(test_id = 0; test_id < NB_TESTS; test_id++) {
+        double size = MIN_SIZE;
+        param.bench_func = TESTS[test_id * 4];
+        param.blas_func = TESTS[test_id * 4 + 1];
+        param.create_matrix = TESTS[test_id * 4 + 2];
+        printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
+        param.n_loop = NLOOP;
+        while(size <= MAX_SIZE) {
+            param.matrix_size = (int)(size + 0.5);
+            double seq_time = seq_bench(&param);
+            double omp_time = omp_bench(&param);
+            double pthread_time = pthread_bench(&param, omp_get_max_threads());
+            printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
+                   "pthread %gs, speedup %g\n",
+                   param.matrix_size, seq_time,
+                   omp_time, seq_time / omp_time,
+                   pthread_time, seq_time / pthread_time);
+            size *= inc_factor;
+        }
+    }
+}
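
A possible way to build and run the new benchmark once the patch is applied (a sketch; it assumes libopenblas has already been built at the top of the source tree, that the compiler supports OpenMP, and the thread count below is only an example):

    make -C benchmark smallscaling
    OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=4 ./benchmark/smallscaling

Setting OPENBLAS_NUM_THREADS=1 keeps each individual BLAS call single-threaded, so the OpenMP and pthread timings measure how many independent small calls scale across threads rather than intra-call threading.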