Merge branch 'develop' into issue2588-cmake

This commit is contained in:
Martin Kroeker 2020-10-11 13:57:07 +02:00 committed by GitHub
commit ac653c94f3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
89 changed files with 1280 additions and 297 deletions

View File

@ -233,6 +233,21 @@ matrix:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
- &test-graviton2
os: linux
arch: arm64-graviton2
dist: focal
group: edge
virt: lxd
compiler: gcc
addons:
apt:
packages:
- gfortran
script:
- travis_wait 45 make && make lapack-test
# whitelist
branches:
only:

View File

@ -279,7 +279,22 @@ COMMON_PROF = -pg
# If you want to enable the experimental BFLOAT16 support
# BUILD_HALF = 1
#
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
# will be allocated on the heap rather than the stack. (This array alone requires
# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
# counts, but obviously it is not the only item that ends up on the stack.)
# The default value of 32 ensures that the overall requirement is compatible
# with the default 1MB stacksize imposed by having the Java VM loaded without use
# of its -Xss parameter.
# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the Java
# VM, e.g. in Octave or with the Java-based libhdfs in numpy or scipy code.
# BLAS3_MEM_ALLOC_THRESHOLD = 160
# the below is not yet configurable, use cmake if you need to build only select types
BUILD_SINGLE = 1
BUILD_DOUBLE = 1

View File

@ -8,6 +8,11 @@ endif
endif
endif
ifdef HAVE_SSE3
CCOMMON_OPT += -msse3
FCOMMON_OPT += -msse3
endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512

View File

@ -46,7 +46,10 @@ Building OpenBLAS requires the following to be installed:
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
The full target list is in the file `TargetList.txt`.
The full target list is in the file `TargetList.txt`. For building with `cmake`, the
usual conventions apply, i.e. create a build directory either underneath the toplevel
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
to the source tree and any build options you plan to set.
### Cross compile
@ -152,13 +155,17 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Falkor**: same as A57 (different cpu specifications)
- **ThunderX**: Optimized some Level-1 functions
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
- **ThunderX3T110**
- **TSV110**: Optimized some Level-3 helper functions
- **EMAG 8180**: preliminary support based on A57
- **Neoverse N1**: (AWS Graviton2) preliminary support
- **Apple Vortex**: preliminary support based on ARMV8
#### PPC/PPC64
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
- **POWER10**:
#### IBM zEnterprise System
@ -167,18 +174,18 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows you to specify an individual list of targets to include instead of the default.
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
common code in the library; usually you will want to set this to the oldest model you expect to encounter.
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
@ -226,7 +233,8 @@ We provide the following functions to control the number of threads at runtime:
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
```
Note that these are only used once at library initialization, and are not available for
fine-tuning thread numbers in individual BLAS calls.
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
## Reporting bugs

View File

@ -146,7 +146,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -152,7 +152,7 @@ int main(int argc, char *argv[]){
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -152,7 +152,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -154,7 +154,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -214,7 +214,7 @@ int main(int argc, char *argv[]){
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -163,7 +163,7 @@ int main(int argc, char *argv[]){
loops = atoi(p);
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -181,7 +181,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -165,7 +165,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -165,7 +165,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -188,7 +188,7 @@ int main(int argc, char *argv[]){
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -158,7 +158,7 @@ int main(int argc, char *argv[]){
exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -152,7 +152,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -150,7 +150,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -155,7 +155,7 @@ int main(int argc, char *argv[]){
exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -139,7 +139,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -139,7 +139,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -174,7 +174,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -139,7 +139,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -139,7 +139,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -156,7 +156,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -168,7 +168,7 @@ int main(int argc, char *argv[])
exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -150,7 +150,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -163,7 +163,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -153,7 +153,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -144,7 +144,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -150,7 +150,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -159,7 +159,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -132,7 +132,7 @@ int main(int argc, char *argv[])
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -132,7 +132,7 @@ int main(int argc, char *argv[])
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -132,7 +132,7 @@ int main(int argc, char *argv[])
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -159,7 +159,7 @@ int main(int argc, char *argv[]){
uplo,diag,loops);
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -146,7 +146,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@ -1,4 +1,3 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets various variables based on architecture.
@ -80,10 +79,15 @@ if (DYNAMIC_ARCH)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP)
if (TRAP)
message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again")
endif ()
if (NOT DYNAMIC_CORE)
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)

View File

@ -70,6 +70,9 @@ if (DEFINED TARGET)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
endif()
if (DEFINED HAVE_SSE3)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
endif()
if (DEFINED TARGET)
@ -323,7 +326,13 @@ else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
endif ()
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD)
if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}")
endif()
endif()
endif()
if (DEFINED LIBNAMESUFFIX)
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
else ()

View File

@ -121,7 +121,6 @@ endif()
include(CheckIncludeFile)
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11)
if (HAVE_C11 EQUAL 1)
message (STATUS found stdatomic.h)
if (HAVE_C11)
set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11")
endif()

View File

@ -352,7 +352,7 @@ typedef int blasint;
#endif
#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5)
#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
#endif
#ifdef BULLDOZER
@ -402,7 +402,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#endif
#ifndef BLAS3_MEM_ALLOC_THRESHOLD
#define BLAS3_MEM_ALLOC_THRESHOLD 160
#define BLAS3_MEM_ALLOC_THRESHOLD 32
#endif
#ifdef QUAD_PRECISION

View File

@ -54,7 +54,7 @@ static char *cpuname_lower[] = {
int get_feature(char *search)
{
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
@ -90,7 +90,7 @@ int get_feature(char *search)
int detect(void)
{
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[512], *p;
@ -289,7 +289,7 @@ void get_libname(void)
void get_features(void)
{
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;

View File

@ -90,7 +90,7 @@ static char *cpuname_lower[] = {
int get_feature(char *search)
{
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
@ -126,7 +126,7 @@ int get_feature(char *search)
int detect(void)
{
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
@ -242,7 +242,7 @@ void get_cpucount(void)
{
int n=0;
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
@ -441,7 +441,7 @@ void get_libname(void)
void get_features(void)
{
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;

View File

@ -84,7 +84,7 @@ static char *cpuname[] = {
int detect(void){
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[512], *p;

View File

@ -90,7 +90,7 @@ static char *cpuname[] = {
int detect(void){
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[512], *p;

View File

@ -104,7 +104,7 @@ char *corename[] = {
int detect(void){
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[512], *p;
@ -214,6 +214,8 @@ switch ( id >> 16 ) {
return CPUTYPE_UNKNOWN;
}
#endif
return CPUTYPE_UNKNOWN;
}
void get_architecture(void){

View File

@ -122,6 +122,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
if( min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -161,9 +164,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
}
for(is = min_i; is < min_l; is += GEMM_P){
for(is = min_i; is < min_l; is += min_i){
min_i = min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if( min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -192,6 +198,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = ls;
if (min_i > GEMM_P) min_i = GEMM_P;
if( min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -231,9 +241,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
STOP_RPCC(gemmcost);
}
for(is = min_i; is < ls; is += GEMM_P){
for(is = min_i; is < ls; is += min_i){
min_i = ls - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if( min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -256,9 +269,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
STOP_RPCC(gemmcost);
}
for(is = ls; is < ls + min_l; is += GEMM_P){
for(is = ls; is < ls + min_l; is += min_i){
min_i = ls + min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if( min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -287,6 +303,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
if (min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -327,9 +347,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
STOP_RPCC(trmmcost);
}
for(is = m - min_l + min_i; is < m; is += GEMM_P){
for(is = m - min_l + min_i; is < m; is += min_i){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if (min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -357,6 +382,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
if (min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -397,9 +426,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
STOP_RPCC(trmmcost);
}
for(is = ls - min_l + min_i; is < ls; is += GEMM_P){
for(is = ls - min_l + min_i; is < ls; is += min_i){
min_i = ls - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if (min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();
@ -423,9 +456,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
}
for(is = ls; is < m; is += GEMM_P){
for(is = ls; is < m; is += min_i){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if (min_i > GEMM_UNROLL_M){
min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();

View File

@ -48,6 +48,21 @@
#else
#ifndef likely
#ifdef __GNUC__
#define likely(x) __builtin_expect(!!(x), 1)
#else
#define likely(x) (x)
#endif
#endif
#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
#ifndef OMP_SCHED
#define OMP_SCHED static
#endif
@ -362,6 +377,9 @@ if (!sb) fprintf(stderr,"SB not declared!!!\n");
int exec_blas(BLASLONG num, blas_queue_t *queue){
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
BLASLONG i, buf_index;
if ((num <= 0) || (queue == NULL)) return 0;

View File

@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg);
#endif
#define get_cpu_ftr(id, var) ({ \
asm("mrs %0, "#id : "=r" (var)); \
__asm__("mrs %0, "#id : "=r" (var)); \
})
static char *corename[] = {

View File

@ -80,7 +80,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#undef COMPILE_TLS
#endif
#if defined(__GLIBC_PREREQ)
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2,20)
#undef COMPILE_TLS
#endif
@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
#include <conio.h>
#undef printf
#define printf _cprintf
#define printf _cprintf
#endif
#ifdef OS_LINUX
@ -190,14 +190,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CONSTRUCTOR __cdecl
#define DESTRUCTOR __cdecl
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@ -272,7 +272,7 @@ int get_num_procs(void) {
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
@ -281,7 +281,7 @@ int get_num_procs(void) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
@ -628,12 +628,12 @@ static void *alloc_mmap(void *address){
if (address){
map_address = mmap(address,
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
} else {
map_address = mmap(address,
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
allocation_block_size,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
}
STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
@ -648,7 +648,7 @@ static void *alloc_mmap(void *address){
#else
#define BENCH_ITERATION 4
#define SCALING 2
#define SCALING 2
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
@ -711,60 +711,60 @@ static void *alloc_mmap(void *address){
#endif
map_address = mmap(NULL, allocation_block_size * SCALING,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
MMAP_ACCESS, MMAP_POLICY, -1, 0);
if (map_address != (void *)-1) {
#ifdef OS_LINUX
#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}
int ret=0;
ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}
#else
my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
start = (BLASULONG)map_address;
current = (SCALING - 1) * allocation_block_size;
original = current;
start = (BLASULONG)map_address;
current = (SCALING - 1) * allocation_block_size;
original = current;
while(current > 0 && current <= original) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE;
current -= PAGESIZE;
}
while(current > 0 && current <= original) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE;
current -= PAGESIZE;
}
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
start = (BLASULONG)map_address;
start = (BLASULONG)map_address;
best = (BLASULONG)-1;
best_address = map_address;
best = (BLASULONG)-1;
best_address = map_address;
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
current = run_bench(start, allocsize);
current = run_bench(start, allocsize);
if (best > current) {
best = current;
best_address = (void *)start;
}
if (best > current) {
best = current;
best_address = (void *)start;
}
start += PAGESIZE;
start += PAGESIZE;
}
}
if ((BLASULONG)best_address > (BLASULONG)map_address)
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
@ -854,9 +854,9 @@ static void *alloc_windows(void *address){
void *map_address;
map_address = VirtualAlloc(address,
allocation_block_size,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
allocation_block_size,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
if (map_address == (void *)NULL) map_address = (void *)-1;
@ -897,9 +897,9 @@ static void *alloc_devicedirver(void *address){
}
map_address = mmap(address, allocation_block_size,
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
@ -974,12 +974,12 @@ static void *alloc_hugetlb(void *address){
shmid = shmget(IPC_PRIVATE, allocation_block_size,
#ifdef OS_LINUX
SHM_HUGETLB |
SHM_HUGETLB |
#endif
#ifdef OS_AIX
SHM_LGPAGE | SHM_PIN |
SHM_LGPAGE | SHM_PIN |
#endif
IPC_CREAT | SHM_R | SHM_W);
IPC_CREAT | SHM_R | SHM_W);
if (shmid != -1) {
map_address = (void *)shmat(shmid, address, SHM_RND);
@ -1026,9 +1026,9 @@ static void *alloc_hugetlb(void *address){
}
map_address = (void *)VirtualAlloc(address,
allocation_block_size,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
allocation_block_size,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
tp.Privileges[0].Attributes = 0;
AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
@ -1078,9 +1078,9 @@ static void *alloc_hugetlbfile(void *address){
unlink(filename);
map_address = mmap(address, allocation_block_size,
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
@ -1107,7 +1107,7 @@ static volatile int memory_initialized = 0;
/* 1 : Level 2 functions */
/* 2 : Thread */
static void blas_memory_cleanup(void* ptr){
static void blas_memory_cleanup(void* ptr){
if (ptr) {
struct alloc_t ** table = (struct alloc_t **)ptr;
int pos;
@ -1243,27 +1243,27 @@ UNLOCK_COMMAND(&alloc_lock);
while ((func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
map_address = (*func)((void *)base_address);
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
}
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
}
#endif
#ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
#endif
}
}
#endif
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif
func ++;
func ++;
}
#ifdef DEBUG
@ -1377,7 +1377,7 @@ static BLASULONG init_lock = 0UL;
#endif
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
void *sa, void *sb, BLASLONG pos) {
void *sa, void *sb, BLASLONG pos) {
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
@ -1507,11 +1507,11 @@ void CONSTRUCTOR gotoblas_init(void) {
struct rlimit curlimit;
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
{
if ( curlimit.rlim_cur != curlimit.rlim_max )
{
curlimit.rlim_cur = curlimit.rlim_max;
setrlimit(RLIMIT_STACK, &curlimit);
}
if ( curlimit.rlim_cur != curlimit.rlim_max )
{
curlimit.rlim_cur = curlimit.rlim_max;
setrlimit(RLIMIT_STACK, &curlimit);
}
}
#endif
@ -1545,7 +1545,7 @@ void DESTRUCTOR gotoblas_quit(void) {
TlsFree(local_storage_key);
#else
pthread_key_delete(local_storage_key);
#endif
#endif
#endif
#ifdef PROFILE
@ -1605,8 +1605,8 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
*/
static int on_process_term(void)
{
gotoblas_quit();
return 0;
gotoblas_quit();
return 0;
}
#ifdef _WIN64
#pragma comment(linker, "/INCLUDE:_tls_used")
@ -1705,7 +1705,7 @@ void gotoblas_dummy_for_PGI(void) {
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
#include <conio.h>
#undef printf
#define printf _cprintf
#define printf _cprintf
#endif
#ifdef OS_LINUX
@ -1734,14 +1734,14 @@ void gotoblas_dummy_for_PGI(void) {
#define CONSTRUCTOR __cdecl
#define DESTRUCTOR __cdecl
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@ -1817,7 +1817,7 @@ int get_num_procs(void) {
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
@ -1826,7 +1826,7 @@ int get_num_procs(void) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
@ -2083,26 +2083,26 @@ static void *alloc_mmap(void *address){
if (address){
map_address = mmap(address,
BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
} else {
map_address = mmap(address,
BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
}
if (map_address != (void *)-1) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#endif
} else {
#ifdef DEBUG
#ifdef DEBUG
int errsv=errno;
perror("OpenBLAS : mmap failed:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
@ -2119,7 +2119,7 @@ static void *alloc_mmap(void *address){
#else
#define BENCH_ITERATION 4
#define SCALING 2
#define SCALING 2
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
@ -2182,22 +2182,22 @@ static void *alloc_mmap(void *address){
#endif
map_address = mmap(NULL, BUFFER_SIZE * SCALING,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
MMAP_ACCESS, MMAP_POLICY, -1, 0);
if (map_address != (void *)-1) {
#ifdef OS_LINUX
#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}
#else
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif
@ -2213,34 +2213,34 @@ static void *alloc_mmap(void *address){
start = (BLASULONG)map_address;
current = (SCALING - 1) * BUFFER_SIZE;
while(current > 0) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE;
current -= PAGESIZE;
}
while(current > 0) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE;
current -= PAGESIZE;
}
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
start = (BLASULONG)map_address;
start = (BLASULONG)map_address;
best = (BLASULONG)-1;
best_address = map_address;
best = (BLASULONG)-1;
best_address = map_address;
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
current = run_bench(start, allocsize);
current = run_bench(start, allocsize);
if (best > current) {
best = current;
best_address = (void *)start;
}
if (best > current) {
best = current;
best_address = (void *)start;
}
start += PAGESIZE;
start += PAGESIZE;
}
}
if ((BLASULONG)best_address > (BLASULONG)map_address)
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
@ -2348,9 +2348,9 @@ static void *alloc_windows(void *address){
void *map_address;
map_address = VirtualAlloc(address,
BUFFER_SIZE,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
BUFFER_SIZE,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
if (map_address == (void *)NULL) map_address = (void *)-1;
@ -2394,9 +2394,9 @@ static void *alloc_devicedirver(void *address){
}
map_address = mmap(address, BUFFER_SIZE,
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
@ -2477,12 +2477,12 @@ static void *alloc_hugetlb(void *address){
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
#ifdef OS_LINUX
SHM_HUGETLB |
SHM_HUGETLB |
#endif
#ifdef OS_AIX
SHM_LGPAGE | SHM_PIN |
SHM_LGPAGE | SHM_PIN |
#endif
IPC_CREAT | SHM_R | SHM_W);
IPC_CREAT | SHM_R | SHM_W);
if (shmid != -1) {
map_address = (void *)shmat(shmid, address, SHM_RND);
@ -2517,7 +2517,7 @@ static void *alloc_hugetlb(void *address){
tp.PrivilegeCount = 1;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
CloseHandle(hToken);
return (void*)-1;
@ -2529,9 +2529,9 @@ static void *alloc_hugetlb(void *address){
}
map_address = (void *)VirtualAlloc(address,
BUFFER_SIZE,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
BUFFER_SIZE,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
tp.Privileges[0].Attributes = 0;
AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
@ -2584,9 +2584,9 @@ static void *alloc_hugetlbfile(void *address){
unlink(filename);
map_address = mmap(address, BUFFER_SIZE,
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
@ -2723,7 +2723,7 @@ void *blas_memory_alloc(int procpos){
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#else
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
@ -2731,7 +2731,7 @@ void *blas_memory_alloc(int procpos){
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
#endif
#endif
}
position ++;
@ -2747,22 +2747,22 @@ void *blas_memory_alloc(int procpos){
LOCK_COMMAND(&alloc_lock);
#endif
do {
RMB;
#if defined(USE_OPENMP)
if (!memory[position].used) {
RMB;
#if defined(USE_OPENMP)
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(USE_OPENMP)
blas_unlock(&memory[position].lock);
blas_unlock(&memory[position].lock);
}
#endif
position ++;
} while (position < NUM_BUFFERS);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;
@ -2776,7 +2776,7 @@ void *blas_memory_alloc(int procpos){
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
do {
@ -2790,27 +2790,27 @@ void *blas_memory_alloc(int procpos){
while ((func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
map_address = (*func)((void *)base_address);
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif
#ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
#endif
}
}
#endif
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif
func ++;
func ++;
}
#ifdef DEBUG
@ -2824,7 +2824,7 @@ void *blas_memory_alloc(int procpos){
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
#endif
memory[position].addr = map_address;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
@ -2862,7 +2862,7 @@ void *blas_memory_alloc(int procpos){
#ifdef DEBUG
printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position);
(void *)memory[position].addr, position);
#endif
return (void *)memory[position].addr;
@ -2888,9 +2888,10 @@ void blas_memory_free(void *free_area){
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
if (memory[position].addr != free_area) goto error;
if (position >= NUM_BUFFERS) goto error;
#ifdef DEBUG
if (memory[position].addr != free_area) goto error;
printf(" Position : %d\n", position);
#endif
@ -2978,7 +2979,7 @@ static BLASULONG init_lock = 0UL;
#endif
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
void *sa, void *sb, BLASLONG pos) {
void *sa, void *sb, BLASLONG pos) {
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
@ -3105,15 +3106,15 @@ void CONSTRUCTOR gotoblas_init(void) {
//#if defined(OS_LINUX)
#if 0
struct rlimit curlimit;
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
{
if ( curlimit.rlim_cur != curlimit.rlim_max )
{
curlimit.rlim_cur = curlimit.rlim_max;
setrlimit(RLIMIT_STACK, &curlimit);
}
}
struct rlimit curlimit;
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
{
if ( curlimit.rlim_cur != curlimit.rlim_max )
{
curlimit.rlim_cur = curlimit.rlim_max;
setrlimit(RLIMIT_STACK, &curlimit);
}
}
#endif
#ifdef SMP
@ -3195,8 +3196,8 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
*/
static int on_process_term(void)
{
gotoblas_quit();
return 0;
gotoblas_quit();
return 0;
}
#ifdef _WIN64
#pragma comment(linker, "/INCLUDE:_tls_used")
@ -3243,7 +3244,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
#endif
}
#endif

View File

@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
"-DHAVE_AVX -DHAVE_FMA4"
"-DHAVE_AVX"
#define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER"
#endif
@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
"-DHAVE_AVX -DHAVE_FMA3"
#define LIBNAME "piledriver"
#define CORENAME "PILEDRIVER"
#endif
@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
"-DHAVE_AVX -DHAVE_FMA3"
#define LIBNAME "steamroller"
#define CORENAME "STEAMROLLER"
#endif
@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
"-DHAVE_AVX -DHAVE_FMA3"
#define LIBNAME "excavator"
#define CORENAME "EXCAVATOR"
#endif

View File

@ -5,6 +5,9 @@ endif
TOPDIR = ..
include $(TOPDIR)/Makefile.system
ifdef HAVE_SSE3
CFLAGS += -msse3
endif
ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)

View File

@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
SDOTKERNEL = dot.S
SDOTKERNEL = ../generic/dot.c
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S

View File

@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
SDOTKERNEL = dot.S
SDOTKERNEL = ../generic/dot.c
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S

View File

@ -70,7 +70,7 @@ DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
SDOTKERNEL = ../generic/dot.c
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S

View File

@ -62,7 +62,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
y5 = a * x[5] + y[5];
y6 = a * x[6] + y[6];
y7 = a * x[7] + y[7];
asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7));
__asm__("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7));
y[0] = y0;
y[1] = y1;
y[2] = y2;
@ -74,7 +74,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
xx = (x + 4*128/sizeof(*x));
yy = (y + 4*128/sizeof(*y));
asm("":"+r"(yy)::"memory");
__asm__("":"+r"(yy)::"memory");
prefetch(xx);
prefetch(yy);

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include "../simd/intrin.h"
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
@ -47,27 +47,59 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -4;
while(i < n1)
int n1 = n & -4;
#if V_SIMD && !defined(DSDOT)
const int vstep = v_nlanes_f32;
const int unrollx4 = n & (-vstep * 4);
const int unrollx = n & -vstep;
v_f32 vsum0 = v_zero_f32();
v_f32 vsum1 = v_zero_f32();
v_f32 vsum2 = v_zero_f32();
v_f32 vsum3 = v_zero_f32();
while(i < unrollx4)
{
vsum0 = v_muladd_f32(
v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0
);
vsum1 = v_muladd_f32(
v_loadu_f32(x + i + vstep), v_loadu_f32(y + i + vstep), vsum1
);
vsum2 = v_muladd_f32(
v_loadu_f32(x + i + vstep*2), v_loadu_f32(y + i + vstep*2), vsum2
);
vsum3 = v_muladd_f32(
v_loadu_f32(x + i + vstep*3), v_loadu_f32(y + i + vstep*3), vsum3
);
i += vstep*4;
}
vsum0 = v_add_f32(
v_add_f32(vsum0, vsum1), v_add_f32(vsum2 , vsum3)
);
while(i < unrollx)
{
vsum0 = v_muladd_f32(
v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0
);
i += vstep;
}
dot = v_sum_f32(vsum0);
#elif defined(DSDOT)
for (; i < n1; i += 4)
{
#if defined(DSDOT)
dot += (double) y[i] * (double) x[i]
+ (double) y[i+1] * (double) x[i+1]
+ (double) y[i+2] * (double) x[i+2]
+ (double) y[i+3] * (double) x[i+3] ;
}
#else
for (; i < n1; i += 4)
{
dot += y[i] * x[i]
+ y[i+1] * x[i+1]
+ y[i+2] * x[i+2]
+ y[i+3] * x[i+3] ;
#endif
i+=4 ;
}
#endif
while(i < n)
{

View File

@ -151,9 +151,9 @@ endif
ZAXPYKERNEL = zaxpy_power10.c
#
SCOPYKERNEL = scopy.c
DCOPYKERNEL = dcopy.c
DCOPYKERNEL = dcopy_power10.c
CCOPYKERNEL = ccopy.c
ZCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy_power10.c
#
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c

View File

@ -0,0 +1,134 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_64 1
static void dcopy_kernel_64 (long n, double *x, double *y)
{
__asm__
(
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"lxvp 48, 256(%2) \n\t"
"lxvp 50, 288(%2) \n\t"
"lxvp 52, 320(%2) \n\t"
"lxvp 54, 352(%2) \n\t"
"lxvp 56, 384(%2) \n\t"
"lxvp 58, 416(%2) \n\t"
"lxvp 60, 448(%2) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %2, %2, 512 \n\t"
"addic. %1, %1, -64 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"lxvp 32, 0(%2) \n\t"
"stxvp 34, 32(%3) \n\t"
"lxvp 34, 32(%2) \n\t"
"stxvp 36, 64(%3) \n\t"
"lxvp 36, 64(%2) \n\t"
"stxvp 38, 96(%3) \n\t"
"lxvp 38, 96(%2) \n\t"
"stxvp 40, 128(%3) \n\t"
"lxvp 40, 128(%2) \n\t"
"stxvp 42, 160(%3) \n\t"
"lxvp 42, 160(%2) \n\t"
"stxvp 44, 192(%3) \n\t"
"lxvp 44, 192(%2) \n\t"
"stxvp 46, 224(%3) \n\t"
"lxvp 46, 224(%2) \n\t"
"stxvp 48, 256(%3) \n\t"
"lxvp 48, 256(%2) \n\t"
"stxvp 50, 288(%3) \n\t"
"lxvp 50, 288(%2) \n\t"
"stxvp 52, 320(%3) \n\t"
"lxvp 52, 320(%2) \n\t"
"stxvp 54, 352(%3) \n\t"
"lxvp 54, 352(%2) \n\t"
"stxvp 56, 384(%3) \n\t"
"lxvp 56, 384(%2) \n\t"
"stxvp 58, 416(%3) \n\t"
"lxvp 58, 416(%2) \n\t"
"stxvp 60, 448(%3) \n\t"
"lxvp 60, 448(%2) \n\t"
"stxvp 62, 480(%3) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %3, %3, 512 \n\t"
"addi %2, %2, 512 \n\t"
"addic. %1, %1, -64 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"stxvp 34, 32(%3) \n\t"
"stxvp 36, 64(%3) \n\t"
"stxvp 38, 96(%3) \n\t"
"stxvp 40, 128(%3) \n\t"
"stxvp 42, 160(%3) \n\t"
"stxvp 44, 192(%3) \n\t"
"stxvp 46, 224(%3) \n\t"
"stxvp 48, 256(%3) \n\t"
"stxvp 50, 288(%3) \n\t"
"stxvp 52, 320(%3) \n\t"
"stxvp 54, 352(%3) \n\t"
"stxvp 56, 384(%3) \n\t"
"stxvp 58, 416(%3) \n\t"
"stxvp 60, 448(%3) \n\t"
"stxvp 62, 480(%3) \n\t"
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
"m" (*x)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -0,0 +1,123 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dcopy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_64
/*
 * Portable fallback used when no optimized kernel defined HAVE_KERNEL_64.
 * Copies x to y in groups of eight elements until at least n elements have
 * been moved; every group is read in full before any of it is written.
 */
static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT group[8];
    FLOAT *src = x;
    FLOAT *dst = y;
    BLASLONG done;
    int k;

    for (done = 0; done < n; done += 8) {
        /* load the whole group first ... */
        for (k = 0; k < 8; k++)
            group[k] = src[k];
        /* ... then store it */
        for (k = 0; k < 8; k++)
            dst[k] = group[k];
        src += 8;
        dst += 8;
    }
}
#endif
/*
 * Copy n elements from x (stride inc_x) to y (stride inc_y).
 * For unit strides the largest multiple of 64 elements is handed to
 * dcopy_kernel_64 and the remainder is copied element by element.
 * Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG bulk;
    BLASLONG src = 0, dst = 0;

    if (n <= 0)
        return 0;

    if (inc_x != 1 || inc_y != 1) {
        /* general strided copy */
        for (k = 0; k < n; k++) {
            y[dst] = x[src];
            src += inc_x;
            dst += inc_y;
        }
        return 0;
    }

    /* contiguous case: kernel for whole blocks of 64, scalar loop for the tail */
    bulk = n & -64;
    if (bulk > 0)
        dcopy_kernel_64(bulk, x, y);

    for (k = bulk; k < n; k++)
        y[k] = x[k];

    return 0;
}

View File

@ -0,0 +1,134 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_32 1
static void zcopy_kernel_32 (long n, double *x, double *y)
{
__asm__
(
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"lxvp 48, 256(%2) \n\t"
"lxvp 50, 288(%2) \n\t"
"lxvp 52, 320(%2) \n\t"
"lxvp 54, 352(%2) \n\t"
"lxvp 56, 384(%2) \n\t"
"lxvp 58, 416(%2) \n\t"
"lxvp 60, 448(%2) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %2, %2, 512 \n\t"
"addic. %1, %1, -32 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"lxvp 32, 0(%2) \n\t"
"stxvp 34, 32(%3) \n\t"
"lxvp 34, 32(%2) \n\t"
"stxvp 36, 64(%3) \n\t"
"lxvp 36, 64(%2) \n\t"
"stxvp 38, 96(%3) \n\t"
"lxvp 38, 96(%2) \n\t"
"stxvp 40, 128(%3) \n\t"
"lxvp 40, 128(%2) \n\t"
"stxvp 42, 160(%3) \n\t"
"lxvp 42, 160(%2) \n\t"
"stxvp 44, 192(%3) \n\t"
"lxvp 44, 192(%2) \n\t"
"stxvp 46, 224(%3) \n\t"
"lxvp 46, 224(%2) \n\t"
"stxvp 48, 256(%3) \n\t"
"lxvp 48, 256(%2) \n\t"
"stxvp 50, 288(%3) \n\t"
"lxvp 50, 288(%2) \n\t"
"stxvp 52, 320(%3) \n\t"
"lxvp 52, 320(%2) \n\t"
"stxvp 54, 352(%3) \n\t"
"lxvp 54, 352(%2) \n\t"
"stxvp 56, 384(%3) \n\t"
"lxvp 56, 384(%2) \n\t"
"stxvp 58, 416(%3) \n\t"
"lxvp 58, 416(%2) \n\t"
"stxvp 60, 448(%3) \n\t"
"lxvp 60, 448(%2) \n\t"
"stxvp 62, 480(%3) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %3, %3, 512 \n\t"
"addi %2, %2, 512 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"stxvp 34, 32(%3) \n\t"
"stxvp 36, 64(%3) \n\t"
"stxvp 38, 96(%3) \n\t"
"stxvp 40, 128(%3) \n\t"
"stxvp 42, 160(%3) \n\t"
"stxvp 44, 192(%3) \n\t"
"stxvp 46, 224(%3) \n\t"
"stxvp 48, 256(%3) \n\t"
"stxvp 50, 288(%3) \n\t"
"stxvp 52, 320(%3) \n\t"
"stxvp 54, 352(%3) \n\t"
"stxvp 56, 384(%3) \n\t"
"stxvp 58, 416(%3) \n\t"
"stxvp 60, 448(%3) \n\t"
"stxvp 62, 480(%3) \n\t"
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
"m" (*x)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -0,0 +1,132 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zcopy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_32
/*
 * Portable fallback used when no optimized kernel defined HAVE_KERNEL_32.
 * n counts two-FLOAT elements: each pass moves eight FLOATs (four elements)
 * and advances the element counter by four. Every group is read in full
 * before any of it is written.
 */
static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT group[8];
    FLOAT *src = x;
    FLOAT *dst = y;
    BLASLONG done;
    int k;

    for (done = 0; done < n; done += 4) {
        /* load the whole group first ... */
        for (k = 0; k < 8; k++)
            group[k] = src[k];
        /* ... then store it */
        for (k = 0; k < 8; k++)
            dst[k] = group[k];
        src += 8;
        dst += 8;
    }
}
#endif
/*
 * Copy n two-FLOAT (real, imaginary) elements from x to y.
 * inc_x / inc_y are strides counted in elements, so the FLOAT index advances
 * by twice the stride. For unit strides the largest multiple of 32 elements
 * is handed to zcopy_kernel_32 and the remainder is copied pair by pair.
 * Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;

    if ( n <= 0 ) return(0);

    if ( (inc_x == 1) && (inc_y == 1 ))
    {
        BLASLONG n1 = n & -32;
        if ( n1 > 0 )
        {
            zcopy_kernel_32(n1, x, y);
            i=n1;
            /* two FLOATs per element already copied by the kernel */
            ix=n1*2;
            iy=n1*2;
        }

        while(i < n)
        {
            /* source is indexed with ix (was x[iy]; same value in this
               branch, but the wrong variable) */
            y[iy] = x[ix] ;
            y[iy+1] = x[ix+1] ;
            ix+=2;
            iy+=2;
            i++ ;
        }
    }
    else
    {
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while(i < n)
        {
            y[iy] = x[ix] ;
            y[iy+1] = x[ix+1] ;
            ix += inc_x2 ;
            iy += inc_y2 ;
            i++ ;
        }
    }
    return(0);
}

80
kernel/simd/intrin.h Normal file
View File

@ -0,0 +1,80 @@
// Front-end header for the small SIMD abstraction layer: it defines the
// inline helper macros, pulls in the compiler intrinsic headers for every
// instruction set enabled through HAVE_* macros, and then includes exactly
// one x86 backend header (plus the NEON one when HAVE_NEON is set).
#ifndef _INTRIN_H_
#define _INTRIN_H_
// BLAS_INLINE: the compiler-specific spelling of "inline"
// (empty when the compiler is neither MSVC nor GNU-compatible).
#if defined(_MSC_VER)
#define BLAS_INLINE __inline
#elif defined(__GNUC__)
#if defined(__STRICT_ANSI__)
#define BLAS_INLINE __inline__
#else
#define BLAS_INLINE inline
#endif
#else
#define BLAS_INLINE
#endif
// BLAS_FINLINE: static function that the compiler is asked to always inline
// (plain "static" when no force-inline attribute is known).
#ifdef _MSC_VER
#define BLAS_FINLINE static __forceinline
#elif defined(__GNUC__)
#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
#else
#define BLAS_FINLINE static
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Include the intrinsic headers for every enabled instruction set.
/** SSE **/
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
/** SSE2 **/
#ifdef HAVE_SSE2
#include <emmintrin.h>
#endif
/** SSE3 **/
#ifdef HAVE_SSE3
#include <pmmintrin.h>
#endif
/** SSSE3 **/
#ifdef HAVE_SSSE3
#include <tmmintrin.h>
#endif
/** SSE41 **/
#ifdef HAVE_SSE4_1
#include <smmintrin.h>
#endif
/** AVX **/
#ifdef HAVE_AVX
#include <immintrin.h>
#endif
/** NEON **/
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
// Dispatch: pick the widest available x86 backend (AVX-512, then AVX2, then
// SSE2); the NEON backend is selected independently.
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
#include "intrin_avx512.h"
#elif defined(HAVE_AVX2)
#include "intrin_avx.h"
#elif defined(HAVE_SSE2)
#include "intrin_sse.h"
#endif
#ifdef HAVE_NEON
#include "intrin_neon.h"
#endif
// No backend defined V_SIMD: report that no SIMD support is available so
// that callers can test V_SIMD / V_SIMD_F64 with #if.
#ifndef V_SIMD
#define V_SIMD 0
#define V_SIMD_F64 0
#endif
#ifdef __cplusplus
}
#endif
#endif // _INTRIN_H_

41
kernel/simd/intrin_avx.h Normal file
View File

@ -0,0 +1,41 @@
#define V_SIMD 256
#define V_SIMD_F64 1
/***************************
* Data Type
***************************/
typedef __m256 v_f32;
#define v_nlanes_f32 8
/***************************
* Arithmetic
***************************/
#define v_add_f32 _mm256_add_ps
#define v_mul_f32 _mm256_mul_ps
#ifdef HAVE_FMA3
// multiply and add, a*b + c
#define v_muladd_f32 _mm256_fmadd_ps
#else
// multiply and add, a*b + c
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{
    /* no FMA3 available: form the product first, then add c */
    const v_f32 product = v_mul_f32(a, b);
    return v_add_f32(product, c);
}
#endif // !HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
BLAS_FINLINE float v_sum_f32(__m256 a)
{
    /* two in-lane horizontal adds: afterwards element 0 of each 128-bit half
       holds the total of that half */
    const __m256 pairs = _mm256_hadd_ps(a, a);
    const __m256 quads = _mm256_hadd_ps(pairs, pairs);
    /* add the upper half onto the lower half and return element 0 */
    return _mm_cvtss_f32(_mm_add_ps(_mm256_castps256_ps128(quads),
                                    _mm256_extractf128_ps(quads, 1)));
}
/***************************
* memory
***************************/
// unaligned load
#define v_loadu_f32 _mm256_loadu_ps
#define v_storeu_f32 _mm256_storeu_ps
#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
#define v_zero_f32 _mm256_setzero_ps

View File

@ -0,0 +1,35 @@
// AVX-512 backend: binds the generic v_* names to 512-bit _mm512_* intrinsics.
// V_SIMD is the vector width in bits.
#define V_SIMD 512
// NOTE(review): presumably advertises double-precision vector support, but no
// f64 operations are defined in this header — confirm before relying on it.
#define V_SIMD_F64 1
/***************************
* Data Type
***************************/
// single-precision vector type
typedef __m512 v_f32;
// number of float lanes held by one v_f32
#define v_nlanes_f32 16
/***************************
* Arithmetic
***************************/
// lane-wise a + b
#define v_add_f32 _mm512_add_ps
// lane-wise a * b
#define v_mul_f32 _mm512_mul_ps
// multiply and add, a*b + c
#define v_muladd_f32 _mm512_fmadd_ps
// Horizontal add: Calculates the sum of all 16 vector elements.
// The vector is folded in halves: 512 -> 256 -> 128 -> 64 -> 32 bits; only
// element 0 of the final vector is meaningful and is returned.
BLAS_FINLINE float v_sum_f32(v_f32 a)
{
    // Fold the upper 256 bits onto the lower 256 bits:
    // 128-bit lanes become [L0+L2, L1+L3, ...].
    __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 sum32 = _mm512_add_ps(a, h64);
    // Fold 128-bit lane 1 onto lane 0. The selector must place lane 1 in
    // position 0 (low two bits == 1); the previous _MM_SHUFFLE(1, 0, 3, 2)
    // selected lane 2 instead, which dropped L1/L3 and counted L2 three times.
    __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(2, 3, 0, 1));
    __m512 sum16 = _mm512_add_ps(sum32, h32);
    // Within each 128-bit lane, fold elements 2,3 onto elements 0,1.
    __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
    __m512 sum8 = _mm512_add_ps(sum16, h16);
    // Fold element 1 onto element 0.
    __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
    __m512 sum4 = _mm512_add_ps(sum8, h4);
    return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
}
/***************************
* memory
***************************/
// unaligned load of 16 floats; PTR is cast before being handed to the intrinsic
#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
// unaligned store of 16 floats
#define v_storeu_f32 _mm512_storeu_ps
// broadcast the scalar VAL to every lane
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
// vector with all lanes zero; invoked as v_zero_f32()
#define v_zero_f32 _mm512_setzero_ps

42
kernel/simd/intrin_neon.h Normal file
View File

@ -0,0 +1,42 @@
// NEON backend: binds the generic v_* names to 128-bit (q-register) intrinsics.
// V_SIMD is the vector width in bits.
#define V_SIMD 128
// V_SIMD_F64 is set only when compiling for AArch64.
// NOTE(review): presumably signals double-precision vector support; no f64
// operations are defined in this header — confirm before relying on it.
#ifdef __aarch64__
#define V_SIMD_F64 1
#else
#define V_SIMD_F64 0
#endif
/***************************
* Data Type
***************************/
// single-precision vector type
typedef float32x4_t v_f32;
// number of float lanes held by one v_f32
#define v_nlanes_f32 4
/***************************
* Arithmetic
***************************/
// lane-wise a + b
#define v_add_f32 vaddq_f32
// lane-wise a * b
#define v_mul_f32 vmulq_f32
// FUSED F32
// multiply and add, a*b + c
// Note the NEON intrinsics take the accumulator first, hence the (c, a, b) order.
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{
#ifdef HAVE_VFPV4 // FMA
    return vfmaq_f32(c, a, b);
#else
    return vmlaq_f32(c, a, b);
#endif
}
// Horizontal add: reduces the four lanes of `a` to one scalar sum.
BLAS_FINLINE float v_sum_f32(float32x4_t a)
{
    // Add the upper 64-bit half to the lower half: [a2+a0, a3+a1].
    float32x2_t halves = vadd_f32(vget_high_f32(a), vget_low_f32(a));
    // Pairwise add collapses the two partial sums into lane 0.
    float32x2_t total = vpadd_f32(halves, halves);
    return vget_lane_f32(total, 0);
}
/***************************
* memory
***************************/
// unaligned load of 4 floats; the argument is parenthesized so that a pointer
// expression such as `p + i` is cast as a whole rather than only its first operand
#define v_loadu_f32(a) vld1q_f32((const float*)(a))
// unaligned store of 4 floats
#define v_storeu_f32 vst1q_f32
// broadcast the scalar VAL to every lane
#define v_setall_f32(VAL) vdupq_n_f32(VAL)
// vector with all lanes zero; invoked as v_zero_f32()
#define v_zero_f32() vdupq_n_f32(0.0f)

46
kernel/simd/intrin_sse.h Normal file
View File

@ -0,0 +1,46 @@
// SSE backend: binds the generic v_* names to 128-bit _mm_* intrinsics.
// V_SIMD is the vector width in bits.
#define V_SIMD 128
// NOTE(review): presumably advertises double-precision vector support, but no
// f64 operations are defined in this header — confirm before relying on it.
#define V_SIMD_F64 1
/***************************
* Data Type
***************************/
// single-precision vector type
typedef __m128 v_f32;
// number of float lanes held by one v_f32
#define v_nlanes_f32 4
/***************************
* Arithmetic
***************************/
// lane-wise a + b
#define v_add_f32 _mm_add_ps
// lane-wise a * b
#define v_mul_f32 _mm_mul_ps
#if defined(HAVE_FMA3)
// multiply and add, a*b + c, mapped directly onto the FMA3 intrinsic
#define v_muladd_f32 _mm_fmadd_ps
#elif defined(HAVE_FMA4)
// multiply and add, a*b + c, mapped directly onto the FMA4 intrinsic
#define v_muladd_f32 _mm_macc_ps
#else
// multiply and add, a*b + c, written as a separate multiply followed by an add
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{
    v_f32 product = v_mul_f32(a, b);
    return v_add_f32(product, c);
}
#endif // HAVE_FMA3
// Horizontal add: reduces the four lanes of `a` to one scalar sum.
BLAS_FINLINE float v_sum_f32(__m128 a)
{
    __m128 total;
#ifdef HAVE_SSE3
    // Two pairwise horizontal adds leave the full sum in every element.
    total = _mm_hadd_ps(a, a);
    total = _mm_hadd_ps(total, total);
#else
    // Without SSE3: fold the upper two elements onto the lower two, then
    // add element 1 onto element 0 with a scalar add.
    __m128 upper = _mm_movehl_ps(a, a);
    __m128 pair = _mm_add_ps(a, upper);
    __m128 odd = _mm_shuffle_ps(pair, pair, 1);
    total = _mm_add_ss(pair, odd);
#endif
    return _mm_cvtss_f32(total);
}
/***************************
* memory
***************************/
// unaligned load of 4 floats
#define v_loadu_f32 _mm_loadu_ps
// unaligned store of 4 floats
#define v_storeu_f32 _mm_storeu_ps
// broadcast the scalar VAL to every lane
#define v_setall_f32(VAL) _mm_set1_ps(VAL)
// vector with all lanes zero; invoked as v_zero_f32()
#define v_zero_f32 _mm_setzero_ps

View File

@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_sandy-2.c"
#endif
#ifndef HAVE_KERNEL_8
#include"../simd/intrin.h"
/*
 * Fallback AXPY micro-kernel: y[i] += (*alpha) * x[i] for i in [0, n).
 *
 * n     : number of elements to process. Neither loop has tail handling, so n
 *         must be a multiple of the step (8 on the scalar path, v_nlanes_f32
 *         on the vector path).
 * x, y  : contiguous input and input/output arrays of at least n elements.
 * alpha : pointer to the scalar multiplier.
 */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i = 0;
	FLOAT a = *alpha;

#if V_SIMD && !defined(DOUBLE) && !defined(XDOUBLE)
	/*
	 * The universal intrinsics used here are single-precision only (v_f32),
	 * so this path is valid only when FLOAT is float. Running it on double
	 * data would reinterpret the arrays as floats and narrow alpha.
	 * NOTE(review): DOUBLE / XDOUBLE are assumed to be the build macros that
	 * select FLOAT's precision — confirm against the project's common header.
	 */
	const v_f32 valpha = v_setall_f32(a);
	const int vstep = v_nlanes_f32;
	for (; i < n; i += vstep) {
		v_f32 tmp = v_muladd_f32(valpha, v_loadu_f32(x + i), v_loadu_f32(y + i));
		v_storeu_f32(y + i, tmp);
	}
#else
	/* Scalar path, unrolled by 8. */
	while (i < n) {
		y[i]   += a * x[i];
		y[i+1] += a * x[i+1];
		y[i+2] += a * x[i+2];
		y[i+3] += a * x[i+3];
		y[i+4] += a * x[i+4];
		y[i+5] += a * x[i+5];
		y[i+6] += a * x[i+6];
		y[i+7] += a * x[i+7];
		i += 8;
	}
#endif
}
#endif

View File

@ -140,13 +140,16 @@
*
* .. Parameters ..
DOUBLE PRECISION ZERO, HALF, ONE
PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0 )
PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0,
$ TWO = 2.0D0 )
DOUBLE PRECISION MULTPL
PARAMETER ( MULTPL = 4.0D+0 )
* ..
* .. Local Scalars ..
DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
$ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z
$ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN,
$ SAFMN2, SAFMX2
INTEGER COUNT
* ..
* .. External Functions ..
DOUBLE PRECISION DLAMCH, DLAPY2
@ -157,7 +160,11 @@
* ..
* .. Executable Statements ..
*
SAFMIN = DLAMCH( 'S' )
EPS = DLAMCH( 'P' )
SAFMN2 = DLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) /
$ LOG( DLAMCH( 'B' ) ) / TWO )
SAFMX2 = ONE / SAFMN2
IF( C.EQ.ZERO ) THEN
CS = ONE
SN = ZERO
@ -212,7 +219,24 @@
* Complex eigenvalues, or real (almost) equal eigenvalues.
* Make diagonal elements equal.
*
COUNT = 0
SIGMA = B + C
10 CONTINUE
COUNT = COUNT + 1
SCALE = MAX( ABS(TEMP), ABS(SIGMA) )
IF( SCALE.GE.SAFMX2 ) THEN
SIGMA = SIGMA * SAFMN2
TEMP = TEMP * SAFMN2
IF (COUNT .LE. 20)
$ GOTO 10
END IF
IF( SCALE.LE.SAFMN2 ) THEN
SIGMA = SIGMA * SAFMX2
TEMP = TEMP * SAFMX2
IF (COUNT .LE. 20)
$ GOTO 10
END IF
P = HALF*TEMP
TAU = DLAPY2( SIGMA, TEMP )
CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) )
SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA )

View File

@ -140,13 +140,16 @@
*
* .. Parameters ..
REAL ZERO, HALF, ONE
PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0 )
PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0,
$ TWO = 2.0E+0 )
REAL MULTPL
PARAMETER ( MULTPL = 4.0E+0 )
* ..
* .. Local Scalars ..
REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
$ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z
$ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN,
$ SAFMN2, SAFMX2
INTEGER COUNT
* ..
* .. External Functions ..
REAL SLAMCH, SLAPY2
@ -157,7 +160,11 @@
* ..
* .. Executable Statements ..
*
SAFMIN = SLAMCH( 'S' )
EPS = SLAMCH( 'P' )
SAFMN2 = SLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) /
$ LOG( SLAMCH( 'B' ) ) / TWO )
SAFMX2 = ONE / SAFMN2
IF( C.EQ.ZERO ) THEN
CS = ONE
SN = ZERO
@ -212,7 +219,24 @@
* Complex eigenvalues, or real (almost) equal eigenvalues.
* Make diagonal elements equal.
*
COUNT = 0
SIGMA = B + C
10 CONTINUE
COUNT = COUNT + 1
SCALE = MAX( ABS(TEMP), ABS(SIGMA) )
IF( SCALE.GE.SAFMX2 ) THEN
SIGMA = SIGMA * SAFMN2
TEMP = TEMP * SAFMN2
IF (COUNT .LE. 20)
$ GOTO 10
END IF
IF( SCALE.LE.SAFMN2 ) THEN
SIGMA = SIGMA * SAFMX2
TEMP = TEMP * SAFMX2
IF (COUNT .LE. 20)
$ GOTO 10
END IF
P = HALF*TEMP
TAU = SLAPY2( SIGMA, TEMP )
CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) )
SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA )

View File

@ -47,3 +47,17 @@ CTEST(dsdot,dsdot_n_1)
ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS);
}
/*
 * dsdot over eight unit-stride single-precision elements, x == y.
 * The returned double is compared against the reference value res2 within
 * DOUBLE_EPS.
 */
CTEST(dsdot,dsdot_n_2)
{
	float x[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F};
	float y[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F};
	blasint incx=1;
	blasint incy=1;
	blasint n=8;
	double res1=0.0, res2= 2.0400000444054616;
	/* Pass the arrays themselves (they decay to float *); &x and &y would
	   have type float (*)[8], which is an incompatible pointer type. */
	res1=BLASFUNC(dsdot)(&n, x, &incx, y, &incy);
	ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS);
}