From ae1d1f74f7ff96b8345189bcba058b7acdc7d494 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:55:33 +0100 Subject: [PATCH 1/7] Query AVX2 and AVX512 capability for runtime cpu selection --- driver/others/dynamic.c | 141 +++++++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1f67dc521..7cc911d32 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -304,9 +304,47 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -403,18 +441,24 @@ static gotoblas_t *get_coretype(void){ } //Intel Haswell if (model == 12 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 13) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -424,27 +468,36 @@ static gotoblas_t *get_coretype(void){ case 4: //Intel Haswell if (model == 5 || model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 7 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -457,40 +510,50 @@ static gotoblas_t *get_coretype(void){ case 5: //Intel Broadwell if (model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } if (model == 5) { // Intel Skylake X -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) return &gotoblas_HASWELL; - else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } -#endif + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Phi Knights Landing if (model == 7) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -503,26 +566,26 @@ static gotoblas_t *get_coretype(void){ case 6: if (model == 6) { // Cannon Lake -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return &gotoblas_HASWELL; -#else - return &gotoblas_SANDYBRIDGE; -#endif - else - return &gotoblas_NEHALEM; -#endif + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } From 0afaae4b2323b28af49ffe81b98d17bd4ced96f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:58:56 +0100 Subject: [PATCH 2/7] Query AVX2 and AVX512VL capability in x86 cpu detection --- common_x86_64.h | 2 +- cpuid.h | 1 + cpuid_x86.c | 132 +++++++++++++++++++++++++++--------------------- 3 files changed, 76 insertions(+), 59 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 62e138e34..f27c1e9be 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op)); + : "0" (op), "c"(0)); #endif } diff --git a/cpuid.h b/cpuid.h index a6bc211f3..c56672ad8 100644 --- a/cpuid.h +++ b/cpuid.h @@ -139,6 +139,7 @@ #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) #define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index eb986b6b6..ddc09857b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ ("mov %%ebx, %%edi;" "cpuid;" "xchgl %%ebx, %%edi;" - : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); #else __asm__ __volatile__ - ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); #endif } @@ -211,6 +211,42 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & 32) != 32){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; @@ -294,6 +330,8 @@ int get_cputype(int gettype){ if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX if (support_avx()) feature |= HAVE_AVX; + if (support_avx2()) feature |= HAVE_AVX2; + if (support_avx512()) feature |= HAVE_AVX512VL; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1228,22 +1266,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: case 15: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 13: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -1252,33 +1286,27 @@ int get_cpuname(void){ switch (model) { case 5: case 6: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: case 15: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 14: //Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1292,46 +1320,36 @@ int get_cpuname(void){ switch (model) { case 6: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 5: // Skylake X -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif case 14: // Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: // Xeon Phi Knights Landing - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1342,30 +1360,24 @@ int get_cpuname(void){ case 6: switch (model) { case 6: // Cannon Lake -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif } break; case 9: case 8: switch (model) { case 14: // Kaby Lake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -2112,6 +2124,8 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); + if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); + if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2180,6 +2194,8 @@ void get_sse(void){ if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); + if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); + if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); From 68eb3146ce4c50ac557cf5f199cc1b4294ba3817 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:07:14 +0100 Subject: [PATCH 3/7] Add xcr0 (os support) check --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index ddc09857b..377267fcc 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -239,6 +239,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From e1574fa2b4a2a781be70d8d521bb3b80a572ca9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:08:02 +0100 Subject: [PATCH 4/7] Add xcr0 (os support) check --- driver/others/dynamic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7cc911d32..4c966260d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,6 +332,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From 31ed19e8b907f72ed4c8ef3165d8577b55264861 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 19:41:13 +0100 Subject: [PATCH 5/7] Add message for SkylakeX and KNL fallbacks to Haswell --- driver/others/dynamic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4c966260d..ba93fca8b 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -346,7 +346,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -526,8 +526,10 @@ static gotoblas_t *get_coretype(void){ // Intel Skylake X if (support_avx512()) return &gotoblas_SKYLAKEX; - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; @@ -550,8 +552,10 @@ static gotoblas_t *get_coretype(void){ } //Intel Phi Knights Landing if (model == 7) { - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; From 191677b902054d1476f3bb12b5360c337c47eb7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 10:46:47 +0100 Subject: [PATCH 6/7] Add travis_wait to the OSX brew install phase --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a854..e8b7e0a27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -153,7 +153,7 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - travis_wait 30 brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From 1650311246d185ca2631c76c33c0212848b57d2a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:43:45 +0100 Subject: [PATCH 7/7] Bump xcode to 8.3 --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e8b7e0a27..51679af62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,11 +149,11 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - travis_wait 30 brew install gcc # for gfortran + - brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: