From cda29633a30bf7ecbc64f85e4bcc6517ad954f1c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 17:59:48 +0200 Subject: [PATCH 01/18] move ALPHA_I out of register 18 (reserved on OSX) --- kernel/arm64/cgemm_kernel_8x4.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 24e08a646..f100adc7a 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] From c7bbad09adf8cdd2fa4b8709ea669e530a0136a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:00:47 +0200 Subject: [PATCH 02/18] Move ALPHA_I out of register 18 (reserved on OSX) --- kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 29a68ff22..2c63925be 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] From 0b1acb0ba3aa327fee65bc6bcf596080dfc39f4b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:03:35 +0200 Subject: [PATCH 03/18] Move ALPHA_I out of register 18 (reserved on OSX) --- kernel/arm64/ctrmm_kernel_8x4.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index 5c0827397..e8f1d8cf3 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 -#define temp x19 -#define tempOffset x20 -#define tempK x21 +#define alphaI w19 +#define temp x20 +#define tempOffset x21 +#define tempK x22 #define alpha0_R s10 #define alphaV0_R v10.s[0] From 108a21e47a754032a9fb5477afcb76c6c158a146 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:05:14 +0200 Subject: [PATCH 04/18] Move ALPHA out of register 18 (reserved on OSX) --- kernel/arm64/sgemm_kernel_sve_v2x8.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S index c969ed4db..60e1f347b 100644 --- a/kernel/arm64/sgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -55,8 +55,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ #define lanes x15 #define pA1 x16 #define pA2 x17 -#define alpha w18 -#define vec_len x19 +#define alpha w19 +#define vec_len x20 #define vec_lenx2 x20 #define alpha0 s10 From 3727672a74c18938230c3a2db012a5693688bfd6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:07:52 +0200 Subject: [PATCH 05/18] Improve workaround and keep compilers from optimizing it out --- kernel/arm64/dznrm2_thunderx2t99.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index e342b0b63..0bd274b3f 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" - +#include <float.h> #include <arm_neon.h> #if defined(SMP) @@ -344,6 +344,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT dummy_alpha[2]; #endif FLOAT ssq, scale; + volatile FLOAT sca; if (n <= 0 || inc_x <= 0) return 0.0; @@ -404,7 +405,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else nrm2_compute(n, x, inc_x, &ssq, &scale); #endif - if (fabs(scale) <1.e-300) return 0.; + sca = fabs(scale); + if (sca < DBL_MIN) return 0.; ssq = sqrt(ssq) * scale; return ssq; From f096a339e4a22f4bc6dc454640e5d4007b07368b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:16:09 +0200 Subject: [PATCH 06/18] Use long value fields for cpu ident on OSX --- cpuid_arm64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 809f48e95..e586f9a3c 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -267,9 +267,9 @@ int detect(void) } #else #ifdef __APPLE__ - sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1 - if (value == 3660830781) return CPU_VORTEX; //A15/M2 + sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); + if (value64 ==131287967|| value64 == 
458787763 ) return CPU_VORTEX; //A12/M1 + if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 #endif return CPU_ARMV8; #endif From 8be68fa7f4edfa0c65949faf67f8feea2c7f0f43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Apr 2023 12:02:39 +0200 Subject: [PATCH 07/18] move declaration of sca to really keep the compiler from throwing it out (for now) --- kernel/arm64/dznrm2_thunderx2t99.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index 0bd274b3f..6077c85dd 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -344,7 +344,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT dummy_alpha[2]; #endif FLOAT ssq, scale; - volatile FLOAT sca; if (n <= 0 || inc_x <= 0) return 0.0; @@ -405,7 +404,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else nrm2_compute(n, x, inc_x, &ssq, &scale); #endif - sca = fabs(scale); + volatile FLOAT sca = fabs(scale); if (sca < DBL_MIN) return 0.; ssq = sqrt(ssq) * scale; From 44164e3a3d7f5c956728596b9f88d43cad0a8c14 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Apr 2023 14:23:13 +0200 Subject: [PATCH 08/18] revert "move alpha out of register 18" (out of PR scope, no SVE on Apple hw) --- kernel/arm64/sgemm_kernel_sve_v2x8.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S index 60e1f347b..c969ed4db 100644 --- a/kernel/arm64/sgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -55,8 +55,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ #define lanes x15 #define pA1 x16 #define pA2 x17 -#define alpha w19 -#define vec_len x20 +#define alpha w18 +#define vec_len x19 #define vec_lenx2 x20 #define alpha0 s10 From 32f2fafde75ca674401c1ce4bc4301ca271536fa Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 24 Nov 2022 13:38:20 +0000 Subject: [PATCH 09/18] Propagate SWITCH_RATIO to DYNAMIC_ARCH builds Previously dynamic builds were either using the default SWITCH_RATIO or one from the higher level architecture; this patch ensures the dynamic builds can use this parameter as well. --- common_param.h | 2 ++ driver/level3/level3_gemm3m_thread.c | 15 ++++++++------ driver/level3/level3_syrk_threaded.c | 13 +++++++----- driver/level3/level3_thread.c | 31 +++++++++++++++++----------- kernel/setparam-ref.c | 5 ++++- lapack/potrf/potrf_parallel.c | 4 ---- param.h | 4 ++++ 7 files changed, 46 insertions(+), 28 deletions(-) diff --git a/common_param.h b/common_param.h index e14ef2782..7230dd00d 100644 --- a/common_param.h +++ b/common_param.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -45,6 +46,7 @@ typedef struct { int dtb_entries; + int switch_ratio; int offsetA, offsetB, align; #ifdef BUILD_BFLOAT16 diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 39824fc5a..26d07fa94 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG divN, divT; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); @@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } */ - if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) { GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } @@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO divT = nthreads; divN = 1; - while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) { do { divT --; divN = 1; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index d7dcd68a3..b03577fb3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. 
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO int mode, mask; double dnum, di, dinum; - if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + + if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 02b60b50d..c9ecf73e8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef GEMM_PREFERED_SIZE #define GEMM_PREFERED_SIZE 1 #endif @@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get execution mode */ #ifndef COMPLEX @@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < SWITCH_RATIO) { - width = SWITCH_RATIO; + if (width < switch_ratio) { + width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); @@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads_m, nthreads_n; +#if 
defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get dimensions from index ranges if available */ if (range_m) { @@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF n = range_n[1] - range_n[0]; } - /* Partitions in m should have at least SWITCH_RATIO rows */ - if (m < 2 * SWITCH_RATIO) { + /* Partitions in m should have at least switch_ratio rows */ + if (m < 2 * switch_ratio) { nthreads_m = 1; } else { nthreads_m = args -> nthreads; - while (m < nthreads_m * SWITCH_RATIO) { + while (m < nthreads_m * switch_ratio) { nthreads_m = nthreads_m / 2; } } - /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ - if (n < SWITCH_RATIO * nthreads_m) { + /* Partitions in n should have at most switch_ratio * nthreads_m columns */ + if (n < switch_ratio * nthreads_m) { nthreads_n = 1; } else { - nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); + nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m); if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); } diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 522c6d7d9..79436f43b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -49,7 +50,9 @@ static void init_parameter(void); gotoblas_t TABLE_NAME = { - DTB_DEFAULT_ENTRIES , + DTB_DEFAULT_ENTRIES, + + SWITCH_RATIO, GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 29364cc05..a7c28f4c2 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -80,10 +80,6 @@ static FLOAT dm1 = -1.; #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef LOWER #define TRANS #endif diff --git a/param.h b/param.h index 19cbe75a5..aec1b6a92 100644 --- a/param.h +++ b/param.h @@ -3838,6 +3838,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif From 5b165420b5962b2b73319f55b747be4f6c697860 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Mon, 5 Dec 2022 15:17:52 +0000 Subject: [PATCH 10/18] SWITCH_RATIO for Arm(R) Neoverse(TM) architecture This seems like a good balance of values for reasonably sized matrices. With `SWITCH_RATIO=16` the DGEMM scales better to bigger sizes but the better solution would be some kind of thread throttling so I've gone with `SWITCH_RATIO=8`. --- param.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index f1f5cbdad..ae391dd3f 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2023, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -3338,6 +3338,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN1) +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3367,7 +3373,11 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) -#define SWITCH_RATIO 16 +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3398,6 +3408,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + #undef SBGEMM_ALIGN_K #define SBGEMM_ALIGN_K 4 From 479509bb37d5cd26baa62462abe461a0a1d43bb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Apr 2023 21:57:25 +0200 Subject: [PATCH 11/18] Remove any stray trailing dash from CROSS_SUFFIX (as would result from clang -arch) --- c_check | 1 + 1 file changed, 1 insertion(+) diff --git a/c_check b/c_check index 9be152b12..232adba67 100755 --- a/c_check +++ b/c_check @@ -40,6 +40,7 @@ bn=`basename \"$compiler_name\"` case "$bn" in *-*) if [ "$bn" != '-' ]; then cross_suffix="$cross_suffix${bn%-*}-" + cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'` fi esac From 7de9335c56aed6fb7fb1590c4d146338ca666726 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:42:09 +0200 Subject: [PATCH 12/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/sgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index c9681fa8b..296eded5a 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ 
b/kernel/x86_64/sgemv_n_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From 99f6d31ed52822ec69bba1f225ef889e99d38f99 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:42:55 +0200 Subject: [PATCH 13/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/sgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 07aa51503..ea89a2aaf 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From d18efaed20e4ed48bba3777a091fa4e49e35b67f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:43:43 +0200 Subject: [PATCH 14/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/ssymv_L.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 45914daf5..4826b00c6 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From bb6d6735bf094b8f1bf6fb7a986f5360c0baf2c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:44:15 +0200 Subject: [PATCH 15/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/ssymv_U.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 26e5ca7e9..06db14ebe 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From 66b39b835c33b1f9300d042010d255f1465a8e3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:44:45 +0200 Subject: [PATCH 16/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/zdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 27397ccfa..72a712a9e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From c2fe9cb91fab22ff00a3a660aa106cfe6b9e132f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:45:14 +0200 Subject: [PATCH 17/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/zgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 8fc960610..678cea957 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From c9174ae8d7e385c9fd030d263f6ff5e07aa9b2ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:45:44 +0200 Subject: [PATCH 18/18] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/zgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 63c8b11a4..44d545df7 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif