Merge branch 'xianyi:develop' into cirrusjobs

This commit is contained in:
Martin Kroeker 2023-04-20 11:27:39 +02:00 committed by GitHub
commit d6a7809504
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 84 additions and 48 deletions

View File

@ -40,6 +40,7 @@ bn=`basename \"$compiler_name\"`
case "$bn" in case "$bn" in
*-*) if [ "$bn" != '-' ]; then *-*) if [ "$bn" != '-' ]; then
cross_suffix="$cross_suffix${bn%-*}-" cross_suffix="$cross_suffix${bn%-*}-"
cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'`
fi fi
esac esac

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -45,6 +46,7 @@
typedef struct { typedef struct {
int dtb_entries; int dtb_entries;
int switch_ratio;
int offsetA, offsetB, align; int offsetA, offsetB, align;
#if BUILD_BFLOAT16 == 1 #if BUILD_BFLOAT16 == 1

View File

@ -267,9 +267,9 @@ int detect(void)
} }
#else #else
#ifdef __APPLE__ #ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0); sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value == 3660830781) return CPU_VORTEX; //A15/M2 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
#endif #endif
return CPU_ARMV8; return CPU_ARMV8;
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2 #define DIVIDE_RATE 2
#endif #endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack. //The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t. //Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG divN, divT; BLASLONG divN, divT;
int mode; int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
if (range_m) { if (range_m) {
BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
BLASLONG m_to = *(((BLASLONG *)range_m) + 1); BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
} }
*/ */
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0; return 0;
} }
@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
divT = nthreads; divT = nthreads;
divN = 1; divN = 1;
while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
do { do {
divT --; divT --;
divN = 1; divN = 1;

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2 #define DIVIDE_RATE 2
#endif #endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack. //The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t. //Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
int mode, mask; int mode, mask;
double dnum, di, dinum; double dnum, di, dinum;
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { #if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0; return 0;
} }

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2 #define DIVIDE_RATE 2
#endif #endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef GEMM_PREFERED_SIZE #ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1 #define GEMM_PREFERED_SIZE 1
#endif #endif
@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
BLASLONG width, i, j, k, js; BLASLONG width, i, j, k, js;
BLASLONG m, n, n_from, n_to; BLASLONG m, n, n_from, n_to;
int mode; int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
/* Get execution mode */ /* Get execution mode */
#ifndef COMPLEX #ifndef COMPLEX
@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
num_parts = 0; num_parts = 0;
while (n > 0){ while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < SWITCH_RATIO) { if (width < switch_ratio) {
width = SWITCH_RATIO; width = switch_ratio;
} }
width = round_up(n, width, GEMM_PREFERED_SIZE); width = round_up(n, width, GEMM_PREFERED_SIZE);
@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
BLASLONG m = args -> m; BLASLONG m = args -> m;
BLASLONG n = args -> n; BLASLONG n = args -> n;
BLASLONG nthreads_m, nthreads_n; BLASLONG nthreads_m, nthreads_n;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
/* Get dimensions from index ranges if available */ /* Get dimensions from index ranges if available */
if (range_m) { if (range_m) {
@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
n = range_n[1] - range_n[0]; n = range_n[1] - range_n[0];
} }
/* Partitions in m should have at least SWITCH_RATIO rows */ /* Partitions in m should have at least switch_ratio rows */
if (m < 2 * SWITCH_RATIO) { if (m < 2 * switch_ratio) {
nthreads_m = 1; nthreads_m = 1;
} else { } else {
nthreads_m = args -> nthreads; nthreads_m = args -> nthreads;
while (m < nthreads_m * SWITCH_RATIO) { while (m < nthreads_m * switch_ratio) {
nthreads_m = nthreads_m / 2; nthreads_m = nthreads_m / 2;
} }
} }
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ /* Partitions in n should have at most switch_ratio * nthreads_m columns */
if (n < SWITCH_RATIO * nthreads_m) { if (n < switch_ratio * nthreads_m) {
nthreads_n = 1; nthreads_n = 1;
} else { } else {
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
if (nthreads_m * nthreads_n > args -> nthreads) { if (nthreads_m * nthreads_n > args -> nthreads) {
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
} }

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15 #define pCRow3 x15
#define pA x16 #define pA x16
#define alphaR w17 #define alphaR w17
#define alphaI w18 #define alphaI w19
#define alpha0_R s10 #define alpha0_R s10
#define alphaV0_R v10.s[0] #define alphaV0_R v10.s[0]

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15 #define pCRow3 x15
#define pA x16 #define pA x16
#define alphaR w17 #define alphaR w17
#define alphaI w18 #define alphaI w19
#define alpha0_R s10 #define alpha0_R s10
#define alphaV0_R v10.s[0] #define alphaV0_R v10.s[0]

View File

@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15 #define pCRow3 x15
#define pA x16 #define pA x16
#define alphaR w17 #define alphaR w17
#define alphaI w18 #define alphaI w19
#define temp x19 #define temp x20
#define tempOffset x20 #define tempOffset x21
#define tempK x21 #define tempK x22
#define alpha0_R s10 #define alpha0_R s10
#define alphaV0_R v10.s[0] #define alphaV0_R v10.s[0]

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <float.h>
#include <arm_neon.h> #include <arm_neon.h>
#if defined(SMP) #if defined(SMP)
@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#else #else
nrm2_compute(n, x, inc_x, &ssq, &scale); nrm2_compute(n, x, inc_x, &ssq, &scale);
#endif #endif
if (fabs(scale) <1.e-300) return 0.; volatile FLOAT sca = fabs(scale);
if (sca < DBL_MIN) return 0.;
ssq = sqrt(ssq) * scale; ssq = sqrt(ssq) * scale;
return ssq; return ssq;

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -49,7 +50,9 @@
static void init_parameter(void); static void init_parameter(void);
gotoblas_t TABLE_NAME = { gotoblas_t TABLE_NAME = {
DTB_DEFAULT_ENTRIES , DTB_DEFAULT_ENTRIES,
SWITCH_RATIO,
GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) #if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize") #pragma GCC optimize("no-tree-vectorize")
#endif #endif

View File

@ -80,10 +80,6 @@ static FLOAT dm1 = -1.;
#define DIVIDE_RATE 2 #define DIVIDE_RATE 2
#endif #endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef LOWER #ifndef LOWER
#define TRANS #define TRANS
#endif #endif

22
param.h
View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project Copyright (c) 2011-2023, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -3338,6 +3338,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#elif defined(NEOVERSEN1) #elif defined(NEOVERSEN1)
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_N 4
@ -3367,7 +3373,11 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#elif defined(NEOVERSEV1) #elif defined(NEOVERSEV1)
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16 #define SWITCH_RATIO 16
#endif
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_N 4
@ -3398,6 +3408,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#elif defined(NEOVERSEN2) #elif defined(NEOVERSEN2)
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif
#undef SBGEMM_ALIGN_K #undef SBGEMM_ALIGN_K
#define SBGEMM_ALIGN_K 4 #define SBGEMM_ALIGN_K 4
@ -3838,6 +3854,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
#endif #endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef QGEMM_DEFAULT_UNROLL_M #ifndef QGEMM_DEFAULT_UNROLL_M
#define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2
#endif #endif