From 9959a60873fbddc9dea23f4c32cc035147d1f351 Mon Sep 17 00:00:00 2001
From: Bart Oldeman
Date: Thu, 20 Oct 2022 13:28:20 -0400
Subject: [PATCH 1/2] Benchmarks: align malloc'ed buffers.

Benchmarks should allocate with cacheline (often 64 bytes) alignment
to avoid unreliable timings. This technique, storing the offset in the
byte before the pointer, doesn't require C11's aligned_alloc for
compatibility with older compilers.

For example, Glibc's x86_64 malloc returns 16-byte aligned buffers, which is
not sufficient for AVX/AVX2 (32-byte preferred) or AVX512 (64-byte).
---
Review notes (ignored by git am):
 * The allocation size previously read
   "(size_t)(void *) + n + L1_DATA_LINESIZE - 1". That parses as a double
   cast applied to "+n", so only n + L1_DATA_LINESIZE - 1 bytes were
   requested. Whenever malloc returns a pointer that is already
   cacheline-aligned, the aligned block starts L1_DATA_LINESIZE bytes in and
   its last byte lies one past the end of the allocation. The size now uses
   sizeof(void *), which reserves room for the saved pointer as intended.
 * Storing the original pointer at newp[-1] relies on malloc returning memory
   aligned to at least sizeof(void *).
 * Line structure and leading whitespace of this series were reconstructed;
   the index hashes are carried over from the original submission. Use
   "git apply --ignore-whitespace" if a context line fails to match.

 benchmark/bench.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/benchmark/bench.h b/benchmark/bench.h
index c03d72bef..f23e487aa 100644
--- a/benchmark/bench.h
+++ b/benchmark/bench.h
@@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){
 
 #endif
 
+/* Benchmarks should allocate with cacheline (often 64 bytes) alignment
+   to avoid unreliable results. This technique, storing the offset in the
+   byte before the pointer, doesn't require C11's aligned_alloc for
+   compatibility with older compilers. */
+static void *aligned_alloc_cacheline(size_t n)
+{
+  void *p = malloc(sizeof(void *) + n + L1_DATA_LINESIZE - 1);
+  if (p) {
+    void **newp = (void **)
+      (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE);
+    newp[-1] = p;
+    p = newp;
+  }
+  return p;
+}
+#define malloc aligned_alloc_cacheline
+#define free(p) free((p) ? ((void **)(p))[-1] : (p))
+
 #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
     struct timeval start, stop;
 #elif defined(__APPLE__)

From 9e6b060bf3d74dd9eac7325cb9e5cc262a5584a6 Mon Sep 17 00:00:00 2001
From: Bart Oldeman
Date: Thu, 20 Oct 2022 20:11:09 -0400
Subject: [PATCH 2/2] Fix comment.

It stores the pointer, not an offset (that would be an alternative approach).
---
Review notes (ignored by git am):
 * The trailing context line now shows the sizeof(void *) form of the
   allocation size so that this patch applies on top of the corrected 1/2.

 benchmark/bench.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/bench.h b/benchmark/bench.h
index f23e487aa..1dae4d0fd 100644
--- a/benchmark/bench.h
+++ b/benchmark/bench.h
@@ -75,9 +75,9 @@ static void *huge_malloc(BLASLONG size){
 #endif
 
 /* Benchmarks should allocate with cacheline (often 64 bytes) alignment
-   to avoid unreliable results. This technique, storing the offset in the
-   byte before the pointer, doesn't require C11's aligned_alloc for
-   compatibility with older compilers. */
+   to avoid unreliable results. This technique, storing the allocated
+   pointer value just before the aligned memory, doesn't require
+   C11's aligned_alloc for compatibility with older compilers. */
 static void *aligned_alloc_cacheline(size_t n)
 {
   void *p = malloc(sizeof(void *) + n + L1_DATA_LINESIZE - 1);