diff --git a/Changelog.txt b/Changelog.txt index 8cd101699..ee0484e2b 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,5 +1,20 @@ OpenBLAS ChangeLog ==================================================================== +Version 0.3.17 + 15-Jul-2021 + +common: + - reverted the optimization of SGEMV_N/DGEMV_N for small input sizes + and consecutive arguments as it led to stack overflows on x86_64 + with some operating systems (notably OSX and Windows) + + x86_64: + - reverted the performance patch for SGEMV_T on AVX512 as it caused + wrong results in some applications + + SPARC: + - fixed compilation with compilers other than gcc +==================================================================== Version 0.3.16 11-Jul-2021 diff --git a/Makefile.rule b/Makefile.rule index 19dd32919..bdc9b69e4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.16 +VERSION = 0.3.16.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library diff --git a/cpuid.h b/cpuid.h index 824e0bc70..2c43922e7 100644 --- a/cpuid.h +++ b/cpuid.h @@ -54,6 +54,7 @@ #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 #define VENDOR_HYGON 11 +#define VENDOR_ZHAOXIN 12 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) diff --git a/cpuid_x86.c b/cpuid_x86.c index 00fc8baa0..5aa49055a 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -283,7 +283,7 @@ int get_vendor(void){ if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor, " Shanghai ")) return VENDOR_CENTAUR; + if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; @@ -1067,7 +1067,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_HYGON) || - (get_vendor() == VENDOR_CENTAUR)) { + (get_vendor() == VENDOR_CENTAUR) || + (get_vendor() == VENDOR_ZHAOXIN)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1190,7 +1191,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ int get_cpuname(void){ - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; if (!have_cpuid()) return CPUTYPE_80386; @@ -1198,6 +1199,7 @@ int get_cpuname(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -1628,15 +1630,20 @@ int get_cpuname(void){ switch (family) { case 0x5: return CPUTYPE_CENTAURC6; - break; case 0x6: - return CPUTYPE_NANO; - break; - case 0x7: + if (model == 0xf && stepping < 0xe) + return CPUTYPE_NANO; return CPUTYPE_NEHALEM; - break; + default: + 
if (family >= 0x7) + return CPUTYPE_NEHALEM; + else + return CPUTYPE_VIAC3; } - return CPUTYPE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN){ + return CPUTYPE_NEHALEM; } if (vendor == VENDOR_RISE){ @@ -1869,7 +1876,7 @@ char *get_lower_cpunamechar(void){ int get_coretype(void){ - int family, exfamily, model, exmodel, vendor; + int family, exfamily, model, exmodel, vendor, stepping; if (!have_cpuid()) return CORE_80486; @@ -1877,6 +1884,7 @@ int get_coretype(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -2286,13 +2294,19 @@ int get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return CORE_NANO; - break; - case 0x7: + if (model == 0xf && stepping < 0xe) + return CORE_NANO; return CORE_NEHALEM; - break; + default: + if (family >= 0x7) + return CORE_NEHALEM; + else + return CORE_VIAC3; } - return CORE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN) { + return CORE_NEHALEM; } return CORE_UNKNOWN; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1a33870db..071788a9b 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 #define VENDOR_HYGON 4 +#define VENDOR_ZHAOXIN 5 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -404,7 +405,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -415,7 +416,7 @@ static int get_vendor(void){ 
static gotoblas_t *get_coretype(void){ int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -423,6 +424,7 @@ static gotoblas_t *get_coretype(void){ exfamily = BITMASK(eax, 20, 0xff); model = BITMASK(eax, 4, 0x0f); exmodel = BITMASK(eax, 16, 0x0f); + stepping = BITMASK(eax, 0, 0x0f); vendor = get_vendor(); @@ -824,13 +826,19 @@ static gotoblas_t *get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return &gotoblas_NANO; - break; - case 0x7: + if (model == 0xf && stepping < 0xe) + return &gotoblas_NANO; return &gotoblas_NEHALEM; + default: + if (family >= 0x7) + return &gotoblas_NEHALEM; } } + if (vendor == VENDOR_ZHAOXIN) { + return &gotoblas_NEHALEM; + } + return NULL; } diff --git a/interface/gemv.c b/interface/gemv.c index 1f14cdb2c..1f0763579 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -201,12 +201,14 @@ void CNAME(enum CBLAS_ORDER order, if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; - + +#if 0 +/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); return; } - +#endif IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index a36c8ace9..76236cd16 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_haswell-4.c" #elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" -#include "sgemv_t_microk_skylakex.c" +/*#include "sgemv_t_microk_skylakex.c"*/ #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/openblas_config_template.h b/openblas_config_template.h index 1e17c9a16..6a7382108 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -99,6 +99,8 @@ typedef int blasint; /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ #ifdef OPENBLAS_OS_LINUX -#define _GNU_SOURCE +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif #include <sched.h> #endif diff --git a/param.h b/param.h index 01048023f..965b97466 100644 --- a/param.h +++ b/param.h @@ -2502,7 +2502,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2534,7 +2534,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4