From 78dcf5c3d583d933f579e6947fc8cf4b1988840f Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Thu, 14 Jan 2016 22:12:57 +0100 Subject: [PATCH] Improve performance of ztrmv on small matrices * Use stack allocation * Disable multi-threading * Ref #727 --- driver/level2/trmv_thread.c | 4 ++-- driver/level2/ztrmv_U.c | 2 +- interface/ztrmv.c | 29 +++++++++++++++++++++++------ 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index a9dc2dc62..42edb83cb 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #endif x = buffer; - buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + buffer += ((COMPSIZE * args -> m + 3) & ~3); } #ifndef TRANS @@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu if (num_cpu) { queue[0].sa = NULL; - queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c index f9671c9d6..063de6cbc 100644 --- a/driver/level2/ztrmv_U.c +++ b/driver/level2/ztrmv_U.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); COPY_K(m, b, incb, buffer, 1); } diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 1abaac920..2be915c32 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, blasint info; int uplo; int unit; - int trans; + int trans, buffer_size; FLOAT *buffer; #ifdef SMP int nthreads; @@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order,
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { - int trans, uplo, unit; + int trans, uplo, unit, buffer_size; blasint info; FLOAT *buffer; #ifdef SMP @@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (incx < 0 ) x -= (n - 1) * incx * 2; - buffer = (FLOAT *)blas_memory_alloc(1); +#ifdef SMP + // Calibrated on a Xeon E5-2630 + if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) { + nthreads = num_cpu_avail(2); + if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 2; + } else + nthreads = 1; + + if(nthreads > 1) { + buffer_size = n > 16 ? 0 : n * 4 + 40; + } + else +#endif + { + buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); + if(incx != 1) + buffer_size += n * 2; + } + STACK_ALLOC(buffer_size, FLOAT, buffer); #ifdef SMP - nthreads = num_cpu_avail(2); - if (nthreads == 1) { #endif @@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } #endif - blas_memory_free(buffer); + STACK_FREE(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);