Improve performances of ztrmv on small matrices

* Use stack allocation
* Disable multi-threading
* Ref #727
This commit is contained in:
Jerome Robert 2016-01-14 22:12:57 +01:00
parent 32f793195f
commit 78dcf5c3d5
3 changed files with 26 additions and 9 deletions

View File

@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
x = buffer;
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
buffer += ((COMPSIZE * args -> m + 3) & ~3);
}
#ifndef TRANS
@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
queue[num_cpu - 1].next = NULL;

View File

@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint info;
int uplo;
int unit;
int trans;
int trans, buffer_size;
FLOAT *buffer;
#ifdef SMP
int nthreads;
@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
int trans, uplo, unit;
int trans, uplo, unit, buffer_size;
blasint info;
FLOAT *buffer;
#ifdef SMP
@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (incx < 0 ) x -= (n - 1) * incx * 2;
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
// Calibrated on a Xeon E5-2630
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
nthreads = num_cpu_avail(2);
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
nthreads = 2;
} else
nthreads = 1;
if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}
else
#endif
{
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
if(incx != 1)
buffer_size += n * 2;
}
STACK_ALLOC(buffer_size, FLOAT, buffer);
#ifdef SMP
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif
blas_memory_free(buffer);
STACK_FREE(buffer);
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);