From 9959a60873fbddc9dea23f4c32cc035147d1f351 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 20 Oct 2022 13:28:20 -0400 Subject: [PATCH] Benchmarks: align malloc'ed buffers. Benchmarks should allocate with cacheline (often 64 bytes) alignment to avoid unreliable timings. This technique, storing the offset in the byte before the pointer, doesn't require C11's aligned_alloc for compatibility with older compilers. For example, Glibc's x86_64 malloc returns 16-byte aligned buffers, which is not sufficient for AVX/AVX2 (32-byte preferred) or AVX512 (64-byte). --- benchmark/bench.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/benchmark/bench.h b/benchmark/bench.h index c03d72bef..f23e487aa 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){ #endif +/* Benchmarks should allocate with cacheline (often 64 bytes) alignment + to avoid unreliable results. This technique, storing the offset in the + byte before the pointer, doesn't require C11's aligned_alloc for + compatibility with older compilers. */ +static void *aligned_alloc_cacheline(size_t n) +{ + void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); + if (p) { + void **newp = (void **) + (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE); + newp[-1] = p; + p = newp; + } + return p; +} +#define malloc aligned_alloc_cacheline +#define free(p) free((p) ? ((void **)(p))[-1] : (p)) + #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; #elif defined(__APPLE__)