diff --git a/driver/others/memory.c b/driver/others/memory.c index c4bd9b73c..6c1fb2264 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -143,14 +143,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FIXED_PAGESIZE 4096 #endif -#ifndef BUFFERS_PER_THREAD -#ifdef USE_OPENMP_UNUSED -#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) -#else -#define BUFFERS_PER_THREAD NUM_BUFFERS -#endif -#endif - #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -338,7 +330,8 @@ int goto_get_num_procs (void) { return blas_cpu_number; } -static void blas_memory_init(); +static void blas_memory_init_from_fork(); +static void blas_tls_init(); void openblas_fork_handler() { @@ -351,7 +344,7 @@ void openblas_fork_handler() // implementation of OpenMP. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; - err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init); + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init_from_fork); if(err != 0) openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); #endif @@ -432,10 +425,8 @@ int openblas_get_num_threads(void) { int hugetlb_allocated = 0; #if defined(OS_WINDOWS) -#define THREAD_LOCAL __declspec(thread) #define LIKELY_ONE(x) (x) #else -#define THREAD_LOCAL __thread #define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif @@ -471,105 +462,65 @@ struct alloc_t { for an auxiliary tracking structure. */ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); -/* Clang supports TLS from version 2.8 */ -#if defined(__clang__) && __clang_major__ > 2 || \ - (__clang_minor__ == 2 || __clang_minor__ == 8) -#define HAS_COMPILER_TLS -#endif - -/* GCC supports TLS from version 4.1 */ -#if !defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define HAS_COMPILER_TLS -#endif - -/* MSVC supports TLS from version 2005 */ -#if defined(_MSC_VER) && _MSC_VER >= 1400 -#define HAS_COMPILER_TLS -#endif - -/* Versions of XCode before 8 did not properly support TLS */ -#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 -#undef HAS_COMPILER_TLS -#endif - -/* Android NDK's before version 12b did not support TLS */ -#if defined(__ANDROID__) && defined(__clang__) -#if __has_include() -#include -#endif -#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ - defined(__NDK_MINOR__) && \ - ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) -#undef HAS_COMPILER_TLS -#endif -#endif - -/* Holds pointers to allocated memory */ -#if defined(SMP) && !defined(USE_OPENMP_UNUSED) -/* This is the number of threads than can be spawned by the server, which is the - server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2 -static int next_memory_table_pos = 0; -# if defined(HAS_COMPILER_TLS) -/* Use compiler generated thread-local-storage */ -static int THREAD_LOCAL local_memory_table_pos = 0; +#if defined(SMP) +# if defined(OS_WINDOWS) +static DWORD local_storage_key = 0; # else -/* Use system-dependent thread-local-storage */ -# if defined(OS_WINDOWS) -static DWORD local_storage_key; -# else -static pthread_key_t local_storage_key; -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#else -/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ -# define MAX_ALLOCATING_THREADS 1 -#endif /* defined(SMP) && !defined(USE_OPENMP) */ -static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; +static pthread_key_t local_storage_key = 0; +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; #endif -/* Global lock for memory allocation */ +/* Global locks for memory allocation */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t tls_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) static pthread_spinlock_t alloc_lock = 0; +static pthread_spinlock_t tls_lock = 0; #else static BLASULONG alloc_lock = 0UL; +static BLASULONG tls_lock = 0UL; #endif /* Returns a pointer to the start of the per-thread memory allocation data */ static __inline struct alloc_t ** get_memory_table() { -#if defined(SMP) && !defined(USE_OPENMP_UNUSED) -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); -# else - int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ - if (!local_memory_table_pos) { - LOCK_COMMAND(&alloc_lock); - local_memory_table_pos = next_memory_table_pos++; - if (next_memory_table_pos > MAX_ALLOCATING_THREADS) - printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); - UNLOCK_COMMAND(&alloc_lock); -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); -# else - pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); -# endif /* defined(OS_WINDOWS) */ -# endif /* !defined(HAS_COMPILER_TLS) */ +#if defined(SMP) + static int tls_initialized = 0; + if (!LIKELY_ONE(tls_initialized)) { + LOCK_COMMAND(&tls_lock); + /* Only one thread can get here at a time, so we are guaranteed to only do this initialization once */ + if (!tls_initialized) { + blas_tls_init(); + /* Now any new thread entering the outer block will either do the TLS init, or nothing */ + tls_initialized = 1; + } + UNLOCK_COMMAND(&tls_lock); } - return local_memory_table[local_memory_table_pos]; +# if defined(OS_WINDOWS) + struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key); +# else + struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ #else - return local_memory_table[0]; -#endif /* defined(SMP) && !defined(USE_OPENMP) */ + static struct alloc_t ** local_memory_table = NULL; +#endif /* defined(SMP) */ + if (local_storage_key && !local_memory_table) { + local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS); + memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS); +#if defined(SMP) +# if defined(OS_WINDOWS) + TlsSetValue(local_storage_key, (void*)local_memory_table); +# else + pthread_setspecific(local_storage_key, (void*)local_memory_table); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ + } + return local_memory_table; } #ifdef ALLOC_MMAP @@ -1069,18 +1020,40 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ +static void blas_memory_cleanup(void* ptr){ + if (ptr) { + struct alloc_t ** table = (struct alloc_t **)ptr; + int pos; + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + struct alloc_t *alloc_info = table[pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + table[pos] = (void *)0; + } + } + free(table); + } +} + +static void blas_tls_init(){ +#if defined(SMP) +# if defined(OS_WINDOWS) + local_storage_key = TlsAlloc(); + TlsSetValue(local_storage_key, (void*)0); +# else + pthread_key_create(&local_storage_key, blas_memory_cleanup); + pthread_setspecific(local_storage_key, (void*)0); +# endif /* defined(OS_WINDOWS) */ +#endif /* defined(SMP) */ +} + static void blas_memory_init(){ -#if defined(SMP) && !defined(USE_OPENMP_UNUSED) - next_memory_table_pos = 0; -# if !defined(HAS_COMPILER_TLS) -# if defined(OS_WINDOWS) - local_storage_key = ::TlsAlloc(); -# else - pthread_key_create(&local_storage_key, NULL); -# endif /* defined(OS_WINDOWS) */ -# endif /* defined(HAS_COMPILER_TLS) */ -#endif /* defined(SMP) && !defined(USE_OPENMP) */ - memset(local_memory_table, 0, sizeof(local_memory_table)); + memset(get_memory_table(), 0, sizeof(struct alloc_t *) * NUM_BUFFERS); +} + +static void blas_memory_init_from_fork(){ + blas_tls_init(); + blas_memory_init(); } void *blas_memory_alloc(int procpos){ @@ -1162,7 +1135,7 @@ void *blas_memory_alloc(int procpos){ if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; - } while (position < BUFFERS_PER_THREAD); + } while (position < NUM_BUFFERS); goto error; @@ -1260,7 +1233,7 @@ void blas_memory_free(void *buffer){ #ifdef DEBUG alloc_table = get_memory_table(); - for (position = 0; position < BUFFERS_PER_THREAD; position++){ + for (position = 0; position < NUM_BUFFERS; position++){ if (alloc_table[position]) { printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); } @@ -1280,22 +1253,15 @@ void blas_memory_free_nolock(void * map_address) { } void blas_shutdown(void){ - - int pos, thread; - #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - struct alloc_t *alloc_info = local_memory_table[thread][pos]; - if (alloc_info) { - alloc_info->release_func(alloc_info); - local_memory_table[thread][pos] = (void *)0; - } - } - } +#ifdef SMP + /* Only cleanupIf we were built for threading and TLS was initialized */ + if (local_storage_key) +#endif + blas_memory_cleanup((void*)get_memory_table()); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1516,6 +1482,9 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: +#if defined(SMP) + blas_memory_cleanup((void*)get_memory_table()); +#endif break; case DLL_PROCESS_DETACH: gotoblas_quit();