diff --git a/Makefile.rule b/Makefile.rule index 21b7e138a..724a60ec4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -97,6 +97,15 @@ VERSION = 0.3.9.dev # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 +# When multithreading, OpenBLAS needs to use a memory buffer for communicating +# and collating results for individual subranges of the original matrix. Since +# the original GotoBLAS of the early 2000s, the default size of this buffer has +# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC. +# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment +# this line and adjust the (32</dev/null 2>/dev/null"); system(@cmd) == 0; if ($? != 0) { $have_msa = 0; diff --git a/cmake/system.cmake b/cmake/system.cmake index 4f8011603..ce980a7b9 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -289,6 +289,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") +if (BUFFERSIZE) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}") +endif () + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () diff --git a/common_x86_64.h b/common_x86_64.h index c05998d58..fe5539abe 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -225,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #define HUGE_PAGESIZE ( 2 << 20) +#ifndef BUFFERSIZE #define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif #define SEEK_ADDRESS diff --git a/cpuid_x86.c b/cpuid_x86.c index 9e1c8e752..e29adecae 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2006,6 +2006,38 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: + if (model == 6) +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + break; + case 7: + if (model == 10) + return CORE_NEHALEM; + if (model == 14) +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + break; case 9: case 8: if (model == 14) { // Kaby Lake diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 1206bf870..90d3051b1 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -3,12 +3,12 @@ extern gotoblas_t gotoblas_Z13; extern gotoblas_t gotoblas_Z14; -extern gotoblas_t gotoblas_Z15; +//extern gotoblas_t gotoblas_Z15; //#if (!defined C_GCC) || (GCC_VERSION >= 60000) //extern gotoblas_t gotoblas_Z14; //#endif -#define NUM_CORETYPES 5 +#define NUM_CORETYPES 4 extern void openblas_warning(int verbose, const char* msg); @@ -16,14 +16,14 @@ static char* corename[] = { "unknown", "Z13", "Z14", - "Z15", +// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { if (gotoblas == &gotoblas_Z13) return corename[1]; if (gotoblas == &gotoblas_Z14) return corename[2]; - if (gotoblas == &gotoblas_Z15) return corename[3]; +// if (gotoblas == &gotoblas_Z15) return corename[3]; //#if (!defined C_GCC) || (GCC_VERSION >= 60000) // if (gotoblas == &gotoblas_POWER9) return corename[3]; //#endif @@ -31,7 +31,7 @@ char* gotoblas_corename(void) { } // __builtin_cpu_is is not supported by zarch -static gotolabs_t* get_coretype(void) { +static gotoblas_t* get_coretype(void) { FILE* infile; char buffer[512], * p; @@ -78,7 +78,7 @@ static gotoblas_t* force_coretype(char* coretype) { { case 1: return (&gotoblas_Z13); case 2: return (&gotoblas_Z14); - case 3: return (&gotoblas_Z15); +// case 3: return (&gotoblas_Z15); //#if (!defined C_GCC) || (GCC_VERSION >= 60000) // case 3: return (&gotoblas_POWER9); //#endif diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index ad15b8f25..b3310e87e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,8 +121,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + set(USE_TRMM true) + endif () + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) set(USE_TRMM true) endif () diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d0d7801fd..e2436f789 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,10 +136,10 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : - "+r" (n) // 0 + "+r" (n), // 0 + "+r" (x), // 1 + "+r" (x1) // 2 : - "r" (x), // 1 - "r" (x1), // 2 "r" (alpha), // 3 "r" (inc_x), // 4 "r" (inc_x3) // 5