Improve performance of ztrmv on small matrices
* Use stack allocation
* Disable multi-threading
* Ref #727
This commit is contained in:
parent
32f793195f
commit
78dcf5c3d5
|
@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
#endif
|
||||
|
||||
x = buffer;
|
||||
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
|
||||
buffer += ((COMPSIZE * args -> m + 3) & ~3);
|
||||
}
|
||||
|
||||
#ifndef TRANS
|
||||
|
@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
|
||||
if (num_cpu) {
|
||||
queue[0].sa = NULL;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
|
||||
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
|
|
|
@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
|
|||
blasint info;
|
||||
int uplo;
|
||||
int unit;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
|
||||
|
||||
int trans, uplo, unit;
|
||||
int trans, uplo, unit, buffer_size;
|
||||
blasint info;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
|
@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#ifdef SMP
|
||||
// Calibrated on a Xeon E5-2630
|
||||
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
|
||||
nthreads = num_cpu_avail(2);
|
||||
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 2;
|
||||
} else
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
|
||||
if(incx != 1)
|
||||
buffer_size += n * 2;
|
||||
}
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
|
@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
|
||||
|
||||
|
|
Loading…
Reference in New Issue