diff --git a/driver/others/memory.c b/driver/others/memory.c index 6f7a7db82..db12d3c55 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -87,6 +87,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif +/* Memory buffer must fit two matrix subblocks of maximal size */ +#define XSTR(x) STR(x) +#define STR(x) #x +#if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \ + BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \ + BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of SGEMM: +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(SGEMM_DEFAULT_R*SGEMM_DEFAULT_Q*4*2) +#endif +#if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \ + BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \ + BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of DGEMM +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(DGEMM_DEFAULT_R*DGEMM_DEFAULT_Q*4*2) +#endif +#if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \ + BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \ + BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of CGEMM +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(CGEMM_DEFAULT_R*CGEMM_DEFAULT_Q*4*2) +#endif +#if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \ + BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \ + BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of ZGEMM +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(ZGEMM_DEFAULT_R*ZGEMM_DEFAULT_Q*4*2) +#endif + #if defined(COMPILE_TLS) #include @@ -129,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -192,51 +228,74 @@ void goto_set_num_threads(int num_threads) {}; #else -#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) +#if defined(OS_LINUX) || defined(OS_SUNOS) #ifndef NO_AFFINITY int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -289,7 +348,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -381,7 +440,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -389,7 +448,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -413,7 +472,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -799,7 +858,7 @@ static void *alloc_qalloc(void *address){ static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); + VirtualFree(alloc_info, 0, MEM_RELEASE); } @@ -912,7 +971,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #ifdef OS_WINDOWS - VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE); #endif @@ -1073,11 +1132,6 @@ static volatile int memory_initialized = 0; } free(table); } -#if defined(OS_WINDOWS) - TlsFree(local_storage_key); -#else - pthread_key_delete(local_storage_key); -#endif } static void blas_memory_init(){ @@ -1295,6 +1349,13 @@ void blas_memory_free_nolock(void * map_address) { free(map_address); } +#ifdef SMP +void blas_thread_memory_cleanup(void) { + blas_memory_cleanup((void*)get_memory_table()); +} +#endif + + void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); @@ -1304,7 +1365,7 @@ void blas_shutdown(void){ /* Only cleanupIf we were built for threading and TLS was initialized */ if (local_storage_key) #endif - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1491,6 +1552,14 @@ void DESTRUCTOR gotoblas_quit(void) { blas_shutdown(); +#if defined(SMP) +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif +#endif + #ifdef PROFILE moncontrol (0); #endif @@ -1526,7 +1595,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser break; case DLL_THREAD_DETACH: #if defined(SMP) - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #endif break; case DLL_PROCESS_DETACH: @@ -1589,6 +1658,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -1596,13 +1666,16 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif #else +/* USE_TLS / COMPILE_TLS not set */ + #include -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 @@ -1616,7 +1689,7 @@ void gotoblas_dummy_for_PGI(void) { #include #include -#ifndef OS_WINDOWS +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1636,7 +1709,7 @@ void gotoblas_dummy_for_PGI(void) { #include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -1675,9 +1748,12 @@ void gotoblas_dummy_for_PGI(void) { #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#else +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1696,50 +1772,75 @@ void goto_set_num_threads(int num_threads) {}; #else -#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) +#if defined(OS_LINUX) || defined(OS_SUNOS) #ifndef NO_AFFINITY int get_num_procs(void); #else int get_num_procs(void) { + static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -1753,7 +1854,7 @@ int get_num_procs(void) { return nums; } #endif - + #ifdef OS_HAIKU int get_num_procs(void) { static int nums = 0; @@ -1790,7 +1891,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -1867,7 +1968,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. -#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -1880,7 +1981,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1888,11 +1989,11 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif - blas_goto_num = 0; + // blas_goto_num = 0; #ifndef USE_OPENMP blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -1904,7 +2005,7 @@ int blas_get_cpu_number(void){ #endif - blas_omp_num = 0; + // blas_omp_num = 0; blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; @@ -1912,7 +2013,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1976,8 +2077,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -1999,11 +2104,21 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX @@ -2145,14 +2260,18 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif - LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif } - UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -2227,7 +2346,7 @@ static void *alloc_qalloc(void *address){ static void alloc_windows_free(struct release_t *release){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(release -> address, 0, MEM_RELEASE); } @@ -2349,7 +2468,7 @@ static void alloc_hugetlb_free(struct release_t *release){ #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE); #endif @@ -2520,7 +2639,7 @@ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; + int mypos = 0; #endif void *map_address; @@ -2551,6 +2670,11 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + +#if defined(USE_OPENMP) + if (!memory_initialized) { +#endif + LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { @@ -2586,6 +2710,9 @@ void *blas_memory_alloc(int procpos){ } UNLOCK_COMMAND(&alloc_lock); +#if defined(USE_OPENMP) + } +#endif #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2600,13 +2727,17 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -// blas_lock(&memory[position].lock); - +#else + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -// blas_unlock(&memory[position].lock); +#else + blas_unlock(&memory[position].lock); +#endif } position ++; @@ -2618,21 +2749,26 @@ void *blas_memory_alloc(int procpos){ position = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif do { -/* if (!memory[position].used) { */ -/* blas_lock(&memory[position].lock);*/ - +#if defined(USE_OPENMP) + if (!memory[position].used) { + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; -/* blas_unlock(&memory[position].lock);*/ -/* } */ - +#if defined(USE_OPENMP) + blas_unlock(&memory[position].lock); + } +#endif position ++; } while (position < NUM_BUFFERS); - UNLOCK_COMMAND(&alloc_lock); - +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif goto error; allocation : @@ -2642,10 +2778,11 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; - +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ - +#else + blas_unlock(&memory[position].lock); +#endif if (!memory[position].addr) { do { #ifdef DEBUG @@ -2662,7 +2799,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); } #endif @@ -2690,9 +2827,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif memory[position].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -2746,8 +2887,9 @@ void blas_memory_free(void *free_area){ #endif position = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); - +#endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; @@ -2761,7 +2903,9 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -2776,8 +2920,9 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#endif return; } @@ -2827,7 +2972,7 @@ void blas_shutdown(void){ #if defined(OS_LINUX) && !defined(NO_WARMUP) -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) @@ -2852,7 +2997,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, if (hot_alloc != 2) { #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) LOCK_COMMAND(&init_lock); #endif @@ -2862,7 +3007,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size -= PAGESIZE; } -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) UNLOCK_COMMAND(&init_lock); #endif @@ -3095,7 +3240,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); - +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -3103,6 +3248,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif