Use stack allocation in zgemv and zger
For better performance with small matrices. Ref #727.
This commit is contained in:
parent
0e68beb89f
commit
32f793195f
|
@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
||||||
blasint incy = *INCY;
|
blasint incy = *INCY;
|
||||||
|
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
|
int buffer_size;
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads;
|
int nthreads;
|
||||||
#endif
|
#endif
|
||||||
|
@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
blasint lenx, leny;
|
blasint lenx, leny;
|
||||||
int trans;
|
int trans, buffer_size;
|
||||||
blasint info, t;
|
blasint info, t;
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int nthreads;
|
int nthreads;
|
||||||
|
@ -230,7 +231,13 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
||||||
if (incy < 0) y -= (leny - 1) * incy * 2;
|
if (incy < 0) y -= (leny - 1) * incy * 2;
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
buffer_size += 160 / sizeof(FLOAT) ;
|
||||||
|
#endif
|
||||||
|
// for alignment
|
||||||
|
buffer_size = (buffer_size + 3) & ~3;
|
||||||
|
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
|
@ -253,7 +260,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
STACK_FREE(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||||
|
|
||||||
|
|
|
@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||||
if (incx < 0) x -= (m - 1) * incx * 2;
|
if (incx < 0) x -= (m - 1) * incx * 2;
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
STACK_ALLOC(2 * m, FLOAT, buffer);
|
||||||
|
|
||||||
#ifdef SMPTEST
|
#ifdef SMPTEST
|
||||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||||
|
@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
STACK_FREE(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue