Benchmarks: align malloc'ed buffers.

Benchmarks should allocate with cacheline (often 64 bytes) alignment
to avoid unreliable timings. This technique, storing the pointer returned
by malloc in the slot just before the aligned pointer, avoids C11's
aligned_alloc and so stays compatible with older compilers.

For example, Glibc's x86_64 malloc returns 16-byte aligned buffers, which is
not sufficient for AVX/AVX2 (32-byte preferred) or AVX512 (64-byte).
This commit is contained in:
Bart Oldeman 2022-10-20 13:28:20 -04:00
parent ad424fce08
commit 9959a60873
1 changed files with 18 additions and 0 deletions

View File

@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){
#endif
#ifndef L1_DATA_LINESIZE
#define L1_DATA_LINESIZE 64 /* fallback when the build configuration gives no value */
#endif
/* Benchmarks should allocate with cacheline (often 64 bytes) alignment
   to avoid unreliable results.

   Returns a pointer to at least n usable bytes aligned to L1_DATA_LINESIZE
   (which must be a power of two), or NULL if the request cannot be
   satisfied. The pointer obtained from malloc() is stored in the
   pointer-sized slot immediately before the returned address, so it can be
   recovered as ((void **)ptr)[-1] and handed to the real free(). This does
   not require C11's aligned_alloc, for compatibility with older compilers. */
static void *aligned_alloc_cacheline(size_t n)
{
  const size_t align = (size_t)L1_DATA_LINESIZE;
  /* Room for the saved pointer plus the worst-case alignment padding. */
  const size_t extra = sizeof(void *) + align - 1;
  void *raw;
  void **aligned;

  /* Refuse sizes whose total would wrap around size_t. */
  if (n > (size_t)-1 - extra)
    return NULL;

  raw = malloc(n + extra);
  if (!raw)
    return NULL;

  /* Lowest aligned address that still leaves sizeof(void *) bytes free
     between raw and the returned pointer. */
  aligned = (void **)(((uintptr_t)raw + extra) & ~(uintptr_t)(align - 1));
  aligned[-1] = raw;
  return aligned;
}
/* Releases a buffer obtained from aligned_alloc_cacheline() by recovering
   the malloc() pointer saved in the slot just before p. NULL is accepted
   and ignored. Defined ahead of the free() macro below so that the call
   inside still reaches the C library's free(). Being a function, it
   evaluates its argument exactly once. */
static void aligned_free_cacheline(void *p)
{
  if (p)
    free(((void **)p)[-1]);
}
/* Redirect the benchmarks' malloc/free calls to the aligned versions. */
#define malloc aligned_alloc_cacheline
#define free(p) aligned_free_cacheline(p)
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
struct timeval start, stop;
#elif defined(__APPLE__)