Support arbitrary numbers of threads for memory allocation.

When I originally refactored memory.c to reduce locking, I made the (incorrect) assumption that all threads were managed by OpenBLAS.  The recent Issues we've seen show that really, any caller can make its own threads and call into OpenBLAS; they don't all come from blas_server.  Thus we have to be able to support an arbitrary number of threads that can come in any time.  The original implementation (before my changes) dealt with this by having a single allocation table, and everyone had to lock to get access to and update it, which was expensive.  Moving to thread-local allocation tables was much faster, but now we have to deal with the fact that thread local storage might not be cleaned up.

This change gives each thread its own local allocation table, and completely does away with the global table.  We cleanup allocations using pthreads' key destructor and Win32's DLL_THREAD_DETACH.

This change also removes compiler TLS, which in the end, wasn't really worth it given the issues with the glibc implementation.  The overall performance impact was < 1%, anyway.  Removing it also simplifies the code.

Support arbitrary numbers of threads for memory allocation.

When I originally refactored memory.c to reduce locking, I made the (incorrect) assumption that all threads were managed by OpenBLAS.  The recent Issues we've seen show that really, any caller can make its own threads and call into OpenBLAS; they don't all come from blas_server.  Thus we have to be able to support an arbitrary number of threads that can come in any time.  The original implementation (before my changes) dealt with this by having a single allocation table, and everyone had to lock to get access to and update it, which was expensive.  Moving to thread-local allocation tables was much faster, but now we have to deal with the fact that thread local storage might not be cleaned up.

This change gives each thread its own local allocation table, and completely does away with the global table.  We cleanup allocations using pthreads' key destructor and Win32's DLL_THREAD_DETACH.

This change also removes compiler TLS, which in the end, wasn't really worth it given the issues with the glibc implementation.  The overall performance impact was < 1%, anyway.  Removing it also simplifies the code.
This commit is contained in:
Craig Donner 2018-08-16 15:18:26 +01:00
parent 66da7677bd
commit c39fab3483
1 changed files with 86 additions and 117 deletions

View File

@ -143,14 +143,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096
#endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP_UNUSED
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__)
@ -338,7 +330,8 @@ int goto_get_num_procs (void) {
return blas_cpu_number;
}
static void blas_memory_init();
static void blas_memory_init_from_fork();
static void blas_tls_init();
void openblas_fork_handler()
{
@ -351,7 +344,7 @@ void openblas_fork_handler()
// implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init_from_fork);
if(err != 0)
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
#endif
@ -432,10 +425,8 @@ int openblas_get_num_threads(void) {
int hugetlb_allocated = 0;
#if defined(OS_WINDOWS)
#define THREAD_LOCAL __declspec(thread)
#define LIKELY_ONE(x) (x)
#else
#define THREAD_LOCAL __thread
#define LIKELY_ONE(x) (__builtin_expect(x, 1))
#endif
@ -471,105 +462,65 @@ struct alloc_t {
for an auxiliary tracking structure. */
static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
/* Clang supports TLS from version 2.8 */
#if defined(__clang__) && __clang_major__ > 2 || \
(__clang_minor__ == 2 || __clang_minor__ == 8)
#define HAS_COMPILER_TLS
#endif
/* GCC supports TLS from version 4.1 */
#if !defined(__clang__) && defined(__GNUC__) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
#define HAS_COMPILER_TLS
#endif
/* MSVC supports TLS from version 2005 */
#if defined(_MSC_VER) && _MSC_VER >= 1400
#define HAS_COMPILER_TLS
#endif
/* Versions of XCode before 8 did not properly support TLS */
#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
#undef HAS_COMPILER_TLS
#endif
/* Android NDK's before version 12b did not support TLS */
#if defined(__ANDROID__) && defined(__clang__)
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
defined(__NDK_MINOR__) && \
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef HAS_COMPILER_TLS
#endif
#endif
/* Holds pointers to allocated memory */
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
/* This is the number of threads than can be spawned by the server, which is the
server plus the number of threads in the thread pool */
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2
static int next_memory_table_pos = 0;
# if defined(HAS_COMPILER_TLS)
/* Use compiler generated thread-local-storage */
static int THREAD_LOCAL local_memory_table_pos = 0;
# else
/* Use system-dependent thread-local-storage */
#if defined(SMP)
# if defined(OS_WINDOWS)
static DWORD local_storage_key;
static DWORD local_storage_key = 0;
# else
static pthread_key_t local_storage_key;
static pthread_key_t local_storage_key = 0;
# endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */
#else
/* There is only one allocating thread when in single-threaded mode and when using OpenMP */
# define MAX_ALLOCATING_THREADS 1
#endif /* defined(SMP) && !defined(USE_OPENMP) */
static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
#endif /* defined(SMP) */
#if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0;
#endif
/* Global lock for memory allocation */
/* Global locks for memory allocation */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tls_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
static pthread_spinlock_t tls_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
static BLASULONG tls_lock = 0UL;
#endif
/* Returns a pointer to the start of the per-thread memory allocation data */
static __inline struct alloc_t ** get_memory_table() {
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
# else
int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
if (!local_memory_table_pos) {
LOCK_COMMAND(&alloc_lock);
local_memory_table_pos = next_memory_table_pos++;
if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
UNLOCK_COMMAND(&alloc_lock);
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
# else
pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
#if defined(SMP)
static int tls_initialized = 0;
if (!LIKELY_ONE(tls_initialized)) {
LOCK_COMMAND(&tls_lock);
/* Only one thread can get here at a time, so we are guaranteed to only do this initialization once */
if (!tls_initialized) {
blas_tls_init();
/* Now any new thread entering the outer block will either do the TLS init, or nothing */
tls_initialized = 1;
}
return local_memory_table[local_memory_table_pos];
UNLOCK_COMMAND(&tls_lock);
}
# if defined(OS_WINDOWS)
struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
# else
struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
# endif /* defined(OS_WINDOWS) */
#else
return local_memory_table[0];
#endif /* defined(SMP) && !defined(USE_OPENMP) */
static struct alloc_t ** local_memory_table = NULL;
#endif /* defined(SMP) */
if (local_storage_key && !local_memory_table) {
local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
#if defined(SMP)
# if defined(OS_WINDOWS)
TlsSetValue(local_storage_key, (void*)local_memory_table);
# else
pthread_setspecific(local_storage_key, (void*)local_memory_table);
# endif /* defined(OS_WINDOWS) */
#endif /* defined(SMP) */
}
return local_memory_table;
}
#ifdef ALLOC_MMAP
@ -1069,18 +1020,40 @@ static volatile int memory_initialized = 0;
/* 1 : Level 2 functions */
/* 2 : Thread */
static void blas_memory_init(){
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
next_memory_table_pos = 0;
# if !defined(HAS_COMPILER_TLS)
static void blas_memory_cleanup(void* ptr){
if (ptr) {
struct alloc_t ** table = (struct alloc_t **)ptr;
int pos;
for (pos = 0; pos < NUM_BUFFERS; pos ++){
struct alloc_t *alloc_info = table[pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
table[pos] = (void *)0;
}
}
free(table);
}
}
static void blas_tls_init(){
#if defined(SMP)
# if defined(OS_WINDOWS)
local_storage_key = ::TlsAlloc();
local_storage_key = TlsAlloc();
TlsSetValue(local_storage_key, (void*)0);
# else
pthread_key_create(&local_storage_key, NULL);
pthread_key_create(&local_storage_key, blas_memory_cleanup);
pthread_setspecific(local_storage_key, (void*)0);
# endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */
#endif /* defined(SMP) && !defined(USE_OPENMP) */
memset(local_memory_table, 0, sizeof(local_memory_table));
#endif /* defined(SMP) */
}
static void blas_memory_init(){
memset(get_memory_table(), 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
}
static void blas_memory_init_from_fork(){
blas_tls_init();
blas_memory_init();
}
void *blas_memory_alloc(int procpos){
@ -1162,7 +1135,7 @@ void *blas_memory_alloc(int procpos){
if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
position ++;
} while (position < BUFFERS_PER_THREAD);
} while (position < NUM_BUFFERS);
goto error;
@ -1260,7 +1233,7 @@ void blas_memory_free(void *buffer){
#ifdef DEBUG
alloc_table = get_memory_table();
for (position = 0; position < BUFFERS_PER_THREAD; position++){
for (position = 0; position < NUM_BUFFERS; position++){
if (alloc_table[position]) {
printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
}
@ -1280,22 +1253,15 @@ void blas_memory_free_nolock(void * map_address) {
}
void blas_shutdown(void){
int pos, thread;
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
#endif
for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
struct alloc_t *alloc_info = local_memory_table[thread][pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
local_memory_table[thread][pos] = (void *)0;
}
}
}
#ifdef SMP
/* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key)
#endif
blas_memory_cleanup((void*)get_memory_table());
#ifdef SEEK_ADDRESS
base_address = 0UL;
@ -1516,6 +1482,9 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP)
blas_memory_cleanup((void*)get_memory_table());
#endif
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();