Add benchmark/smallscaling.c
* Bench small matrices with multi-threading * Close #727
This commit is contained in:
parent
5fc2203d8a
commit
73397faf68
|
@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
|||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||
|
|
|
@ -0,0 +1,191 @@
|
|||
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <cblas.h>
|
||||
#include <omp.h>
|
||||
#define MIN_SIZE 5
|
||||
#define MAX_SIZE 60
|
||||
#define NB_SIZE 10
|
||||
|
||||
// number of loop for a 1x1 matrix. Lower it if the test is
|
||||
// too slow on you computer.
|
||||
#define NLOOP 2e7
|
||||
|
||||
typedef struct {
|
||||
int matrix_size;
|
||||
int n_loop;
|
||||
void (* bench_func)();
|
||||
void (* blas_func)();
|
||||
void * (* create_matrix)(int size);
|
||||
} BenchParam;
|
||||
|
||||
void * s_create_matrix(int size) {
|
||||
float * r = malloc(size * sizeof(double));
|
||||
for(int i = 0; i < size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void * c_create_matrix(int size) {
|
||||
float * r = malloc(size * 2 * sizeof(double));
|
||||
for(int i = 0; i < 2 * size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void * z_create_matrix(int size) {
|
||||
double * r = malloc(size * 2 * sizeof(double));
|
||||
for(int i = 0; i < 2 * size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void * d_create_matrix(int size) {
|
||||
double * r = malloc(size * sizeof(double));
|
||||
for(int i = 0; i < size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void trmv_bench(BenchParam * param)
|
||||
{
|
||||
int i, n;
|
||||
int size = param->matrix_size;
|
||||
n = param->n_loop / size;
|
||||
int one = 1;
|
||||
void * A = param->create_matrix(size * size);
|
||||
void * y = param->create_matrix(size);
|
||||
for(i = 0; i < n; i++) {
|
||||
param->blas_func("U", "N", "N", &size, A, &size, y, &one);
|
||||
}
|
||||
free(A);
|
||||
free(y);
|
||||
}
|
||||
|
||||
void gemv_bench(BenchParam * param)
|
||||
{
|
||||
int i, n;
|
||||
int size = param->matrix_size;
|
||||
n = param->n_loop / size;
|
||||
double v = 1.01;
|
||||
int one = 1;
|
||||
void * A = param->create_matrix(size * size);
|
||||
void * y = param->create_matrix(size);
|
||||
for(i = 0; i < n; i++) {
|
||||
param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
|
||||
}
|
||||
free(A);
|
||||
free(y);
|
||||
}
|
||||
|
||||
void ger_bench(BenchParam * param) {
|
||||
int i, n;
|
||||
int size = param->matrix_size;
|
||||
n = param->n_loop / size;
|
||||
double v = 1.01;
|
||||
int one = 1;
|
||||
void * A = param->create_matrix(size * size);
|
||||
void * y = param->create_matrix(size);
|
||||
for(i = 0; i < n; i++) {
|
||||
param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
|
||||
}
|
||||
free(A);
|
||||
free(y);
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
void * pthread_func_wrapper(void * param) {
|
||||
((BenchParam *)param)->bench_func(param);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
#define NB_TESTS 5
|
||||
void * TESTS[4 * NB_TESTS] = {
|
||||
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
|
||||
gemv_bench, dgemv_, d_create_matrix, "dgemv",
|
||||
gemv_bench, zgemv_, z_create_matrix, "zgemv",
|
||||
ger_bench, dger_, d_create_matrix, "dger",
|
||||
ger_bench, zgerc_, z_create_matrix, "zgerc",
|
||||
};
|
||||
|
||||
inline static double delta_time(struct timespec tick) {
|
||||
struct timespec tock;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tock);
|
||||
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
|
||||
}
|
||||
|
||||
double pthread_bench(BenchParam * param, int nb_threads)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return 0;
|
||||
#else
|
||||
BenchParam threaded_param = *param;
|
||||
pthread_t threads[nb_threads];
|
||||
int t, rc;
|
||||
struct timespec tick;
|
||||
threaded_param.n_loop /= nb_threads;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tick);
|
||||
for(t=0; t<nb_threads; t++){
|
||||
rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
|
||||
if (rc){
|
||||
printf("ERROR; return code from pthread_create() is %d\n", rc);
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
for(t=0; t<nb_threads; t++){
|
||||
pthread_join(threads[t], NULL);
|
||||
}
|
||||
return delta_time(tick);
|
||||
#endif
|
||||
}
|
||||
|
||||
double seq_bench(BenchParam * param) {
|
||||
struct timespec tick;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tick);
|
||||
param->bench_func(param);
|
||||
return delta_time(tick);
|
||||
}
|
||||
|
||||
double omp_bench(BenchParam * param) {
|
||||
BenchParam threaded_param = *param;
|
||||
struct timespec tick;
|
||||
int t;
|
||||
int nb_threads = omp_get_max_threads();
|
||||
threaded_param.n_loop /= nb_threads;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tick);
|
||||
#pragma omp parallel for
|
||||
for(t = 0; t < nb_threads; t ++){
|
||||
param->bench_func(&threaded_param);
|
||||
}
|
||||
return delta_time(tick);
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
|
||||
BenchParam param;
|
||||
int test_id;
|
||||
printf ("Running on %d threads\n", omp_get_max_threads());
|
||||
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
|
||||
double size = MIN_SIZE;
|
||||
param.bench_func = TESTS[test_id * 4];
|
||||
param.blas_func = TESTS[test_id * 4 + 1];
|
||||
param.create_matrix = TESTS[test_id * 4 + 2];
|
||||
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
|
||||
param.n_loop = NLOOP;
|
||||
while(size <= MAX_SIZE) {
|
||||
param.matrix_size = (int)(size + 0.5);
|
||||
double seq_time = seq_bench(¶m);
|
||||
double omp_time = omp_bench(¶m);
|
||||
double pthread_time = pthread_bench(¶m, omp_get_max_threads());
|
||||
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
|
||||
"pthread %gs, speedup %g\n",
|
||||
param.matrix_size, seq_time,
|
||||
omp_time, seq_time / omp_time,
|
||||
pthread_time, seq_time / pthread_time);
|
||||
size *= inc_factor;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue