Add files via upload

This commit is contained in:
Martin Kroeker 2018-08-22 10:03:02 +02:00 committed by GitHub
parent 61659f8765
commit 6d00c674ab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 110 additions and 125 deletions

View File

@ -74,6 +74,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <errno.h> #include <errno.h>
//#define DEBUG
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#define ALLOC_WINDOWS #define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES #ifndef MEM_LARGE_PAGES
@ -108,6 +110,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/resource.h> #include <sys/resource.h>
#endif #endif
#ifdef OS_HAIKU
#include <unistd.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/resource.h> #include <sys/resource.h>
@ -139,14 +145,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096 #define FIXED_PAGESIZE 4096
#endif #endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
#endif
#endif
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#if defined(_MSC_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__clang__)
@ -238,6 +236,14 @@ int get_num_procs(void) {
} }
#endif #endif
#ifdef OS_HAIKU
/*
 * Report the number of configured processors (Haiku variant).
 *
 * The count is obtained from sysconf(_SC_NPROCESSORS_CONF) on the first
 * call and cached in a function-local static for later calls.
 *
 * sysconf() returns a long and yields -1 when the query fails.  Storing
 * that value unchecked would cache a negative processor count, so any
 * non-positive answer is replaced by 1.
 *
 * Returns: the cached processor count, always >= 1.
 */
int get_num_procs(void) {
  static int nums = 0;

  if (!nums) {
    long count = sysconf(_SC_NPROCESSORS_CONF);

    /* Fall back to a single processor if the query failed. */
    nums = (count > 0) ? (int)count : 1;
  }
  return nums;
}
#endif
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
int get_num_procs(void) { int get_num_procs(void) {
@ -327,6 +333,7 @@ int goto_get_num_procs (void) {
} }
static void blas_memory_init(); static void blas_memory_init();
static void blas_tls_init();
void openblas_fork_handler() void openblas_fork_handler()
{ {
@ -363,7 +370,7 @@ int blas_get_cpu_number(void){
#endif #endif
// blas_goto_num = 0; // blas_goto_num = 0;
#ifndef USE_OPENMP #ifndef USE_OPENMP_UNUSED
blas_goto_num=openblas_num_threads_env(); blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0; if (blas_goto_num < 0) blas_goto_num = 0;
@ -420,10 +427,8 @@ int openblas_get_num_threads(void) {
int hugetlb_allocated = 0; int hugetlb_allocated = 0;
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#define THREAD_LOCAL __declspec(thread)
#define LIKELY_ONE(x) (x) #define LIKELY_ONE(x) (x)
#else #else
#define THREAD_LOCAL __thread
#define LIKELY_ONE(x) (__builtin_expect(x, 1)) #define LIKELY_ONE(x) (__builtin_expect(x, 1))
#endif #endif
@ -459,105 +464,68 @@ struct alloc_t {
for an auxiliary tracking structure. */ for an auxiliary tracking structure. */
static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
/* Clang supports TLS from version 2.8 */ #if defined(SMP)
#if defined(__clang__) && __clang_major__ > 2 || \
(__clang_minor__ == 2 || __clang_minor__ == 8)
#define HAS_COMPILER_TLS
#endif
/* GCC supports TLS from version 4.1 */
#if !defined(__clang__) && defined(__GNUC__) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
#define HAS_COMPILER_TLS
#endif
/* MSVC supports TLS from version 2005 */
#if defined(_MSC_VER) && _MSC_VER >= 1400
#define HAS_COMPILER_TLS
#endif
/* Versions of XCode before 8 did not properly support TLS */
#if defined(__apple_build_version__) && __apple_build_version__ < 8000042
#undef HAS_COMPILER_TLS
#endif
/* Android NDK's before version 12b did not support TLS */
#if defined(__ANDROID__) && defined(__clang__)
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
defined(__NDK_MINOR__) && \
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef HAS_COMPILER_TLS
#endif
#endif
/* Holds pointers to allocated memory */
#if defined(SMP) && !defined(USE_OPENMP)
/* This is the number of threads that can be spawned by the server, which is the
server plus the number of threads in the thread pool */
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
static int next_memory_table_pos = 0;
# if defined(HAS_COMPILER_TLS)
/* Use compiler generated thread-local-storage */
static int THREAD_LOCAL local_memory_table_pos = 0;
# else
/* Use system-dependent thread-local-storage */
# if defined(OS_WINDOWS) # if defined(OS_WINDOWS)
static DWORD local_storage_key; static DWORD local_storage_key = 0;
# else # else
static pthread_key_t local_storage_key; static pthread_key_t local_storage_key = 0;
# endif /* defined(OS_WINDOWS) */ # endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */ #endif /* defined(SMP) */
#else
/* There is only one allocating thread when in single-threaded mode and when using OpenMP */
# define MAX_ALLOCATING_THREADS 1
#endif /* defined(SMP) && !defined(USE_OPENMP) */
static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
#if defined(OS_LINUX) && !defined(NO_WARMUP) #if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0; static int hot_alloc = 0;
#endif #endif
/* Global lock for memory allocation */ /* Global locks for memory allocation */
#if defined(USE_PTHREAD_LOCK) #if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tls_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK) #elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0; static pthread_spinlock_t alloc_lock = 0;
static pthread_spinlock_t tls_lock = 0;
#else #else
static BLASULONG alloc_lock = 0UL; static BLASULONG alloc_lock = 0UL;
static BLASULONG tls_lock = 0UL;
#endif #endif
/* Returns a pointer to the start of the per-thread memory allocation data */ /* Returns a pointer to the start of the per-thread memory allocation data */
static __inline struct alloc_t ** get_memory_table() { static __inline struct alloc_t ** get_memory_table() {
#if defined(SMP) && !defined(USE_OPENMP) #if defined(SMP)
# if !defined(HAS_COMPILER_TLS) static int tls_initialized = 0;
# if defined(OS_WINDOWS) if (!LIKELY_ONE(tls_initialized)) {
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); LOCK_COMMAND(&tls_lock);
# else /* Only one thread can get here at a time, so we are guaranteed to only do this initialization once */
int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); if (!tls_initialized) {
# endif /* defined(OS_WINDOWS) */ blas_tls_init();
# endif /* !defined(HAS_COMPILER_TLS) */ /* Now any new thread entering the outer block will either do the TLS init, or nothing */
if (!local_memory_table_pos) { tls_initialized = 1;
LOCK_COMMAND(&alloc_lock);
local_memory_table_pos = next_memory_table_pos++;
if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
UNLOCK_COMMAND(&alloc_lock);
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
# else
pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
# endif /* defined(OS_WINDOWS) */
# endif /* !defined(HAS_COMPILER_TLS) */
} }
return local_memory_table[local_memory_table_pos]; UNLOCK_COMMAND(&tls_lock);
}
# if defined(OS_WINDOWS)
struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
# else # else
return local_memory_table[0]; struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
#endif /* defined(SMP) && !defined(USE_OPENMP) */ # endif /* defined(OS_WINDOWS) */
#else
static struct alloc_t ** local_memory_table = NULL;
#endif /* defined(SMP) */
//QUAK
if (!local_memory_table) fprintf(stderr,"get_memory_table: NULL\n");
if (!local_storage_key) fprintf(stderr,"get_memory_table: no key\n");
if (local_storage_key && !local_memory_table) {
local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
#if defined(SMP)
# if defined(OS_WINDOWS)
TlsSetValue(local_storage_key, (void*)local_memory_table);
# else
pthread_setspecific(local_storage_key, (void*)local_memory_table);
# endif /* defined(OS_WINDOWS) */
#endif /* defined(SMP) */
}
return local_memory_table;
} }
#ifdef ALLOC_MMAP #ifdef ALLOC_MMAP
@ -637,7 +605,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
static void *alloc_mmap(void *address){ static void *alloc_mmap(void *address){
void *map_address, *best_address; void *map_address, *best_address;
BLASULONG best, start, current; BLASULONG best, start, current, original;
BLASULONG allocsize; BLASULONG allocsize;
if (address){ if (address){
@ -685,8 +653,9 @@ static void *alloc_mmap(void *address){
start = (BLASULONG)map_address; start = (BLASULONG)map_address;
current = (SCALING - 1) * allocation_block_size; current = (SCALING - 1) * allocation_block_size;
original = current;
while(current > 0) { while(current > 0 && current <= original) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE; *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE; start += PAGESIZE;
current -= PAGESIZE; current -= PAGESIZE;
@ -1056,18 +1025,35 @@ static volatile int memory_initialized = 0;
/* 1 : Level 2 functions */ /* 1 : Level 2 functions */
/* 2 : Thread */ /* 2 : Thread */
static void blas_memory_init(){ static void blas_memory_cleanup(void* ptr){
#if defined(SMP) && !defined(USE_OPENMP) fprintf(stderr,"blas_memory_cleanup\n");
next_memory_table_pos = 0; if (ptr) {
# if !defined(HAS_COMPILER_TLS) struct alloc_t ** table = (struct alloc_t **)ptr;
int pos;
for (pos = 0; pos < NUM_BUFFERS; pos ++){
struct alloc_t *alloc_info = table[pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
table[pos] = (void *)0;
}
}
free(table);
}
}
static void blas_tls_init(){
#if defined(SMP)
# if defined(OS_WINDOWS) # if defined(OS_WINDOWS)
local_storage_key = ::TlsAlloc(); local_storage_key = TlsAlloc();
# else # else
pthread_key_create(&local_storage_key, NULL); pthread_key_create(&local_storage_key, blas_memory_cleanup);
# endif /* defined(OS_WINDOWS) */ # endif /* defined(OS_WINDOWS) */
# endif /* defined(HAS_COMPILER_TLS) */ #endif /* defined(SMP) */
#endif /* defined(SMP) && !defined(USE_OPENMP) */ }
memset(local_memory_table, 0, sizeof(local_memory_table));
static void blas_memory_init(){
blas_tls_init();
memset(get_memory_table(), 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
} }
void *blas_memory_alloc(int procpos){ void *blas_memory_alloc(int procpos){
@ -1104,15 +1090,16 @@ void *blas_memory_alloc(int procpos){
void *(**func)(void *address); void *(**func)(void *address);
struct alloc_t * alloc_info; struct alloc_t * alloc_info;
struct alloc_t ** alloc_table; struct alloc_t ** alloc_table;
//fprintf(stderr,"blas_memory_alloc procpos=%d\n",procpos);
if (!LIKELY_ONE(memory_initialized)) { if (!LIKELY_ONE(memory_initialized)) {
//QUAK
blas_tls_init();
#if defined(SMP) && !defined(USE_OPENMP) #if defined(SMP) && !defined(USE_OPENMP)
/* Only allow a single thread to initialize memory system */ /* Only allow a single thread to initialize memory system */
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) { if (!memory_initialized) {
#endif #endif
blas_memory_init();
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
gotoblas_dynamic_init(); gotoblas_dynamic_init();
#endif #endif
@ -1146,10 +1133,10 @@ void *blas_memory_alloc(int procpos){
position = 0; position = 0;
alloc_table = get_memory_table(); alloc_table = get_memory_table();
do { do {
if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; if (alloc_table == NULL || !alloc_table[position] || !alloc_table[position]->used) goto allocation;
position ++; position ++;
} while (position < BUFFERS_PER_THREAD); } while (position < NUM_BUFFERS);
goto error; goto error;
@ -1158,7 +1145,9 @@ void *blas_memory_alloc(int procpos){
#ifdef DEBUG #ifdef DEBUG
printf(" Position -> %d\n", position); printf(" Position -> %d\n", position);
#endif #endif
if (alloc_table == NULL)
alloc_info = NULL;
else
alloc_info = alloc_table[position]; alloc_info = alloc_table[position];
if (!alloc_info) { if (!alloc_info) {
do { do {
@ -1170,7 +1159,7 @@ void *blas_memory_alloc(int procpos){
func = &memoryalloc[0]; func = &memoryalloc[0];
while ((func != NULL) && (map_address == (void *) -1)) { while ((func != NULL) && ((*func) != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address); map_address = (*func)((void *)base_address);
@ -1204,13 +1193,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1); } while ((BLASLONG)map_address == -1);
if (!alloc_table) alloc_table = map_address;
alloc_table[position] = alloc_info = map_address; alloc_table[position] = alloc_info = map_address;
#ifdef DEBUG #ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position); printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
#endif #endif
} }
#ifdef DEBUG #ifdef DEBUG
printf("Mapped : %p %3d\n\n", (void *)alloc_info, position); printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
#endif #endif
@ -1247,7 +1236,7 @@ void blas_memory_free(void *buffer){
#ifdef DEBUG #ifdef DEBUG
alloc_table = get_memory_table(); alloc_table = get_memory_table();
for (position = 0; position < BUFFERS_PER_THREAD; position++){ for (position = 0; position < NUM_BUFFERS; position++){
if (alloc_table[position]) { if (alloc_table[position]) {
printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
} }
@ -1267,22 +1256,15 @@ void blas_memory_free_nolock(void * map_address) {
} }
void blas_shutdown(void){ void blas_shutdown(void){
int pos, thread;
#ifdef SMP #ifdef SMP
BLASFUNC(blas_thread_shutdown)(); BLASFUNC(blas_thread_shutdown)();
#endif #endif
for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ #ifdef SMP
for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ /* Only cleanupIf we were built for threading and TLS was initialized */
struct alloc_t *alloc_info = local_memory_table[thread][pos]; if (local_storage_key)
if (alloc_info) { #endif
alloc_info->release_func(alloc_info); blas_memory_cleanup((void*)get_memory_table());
alloc_info = (void *)0;
}
}
}
#ifdef SEEK_ADDRESS #ifdef SEEK_ADDRESS
base_address = 0UL; base_address = 0UL;
@ -1503,6 +1485,9 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
case DLL_THREAD_ATTACH: case DLL_THREAD_ATTACH:
break; break;
case DLL_THREAD_DETACH: case DLL_THREAD_DETACH:
#if defined(SMP)
blas_memory_cleanup((void*)get_memory_table());
#endif
break; break;
case DLL_PROCESS_DETACH: case DLL_PROCESS_DETACH:
gotoblas_quit(); gotoblas_quit();