Merge pull request #1570 from xianyi/develop

Update release-0.3.0 branch to match develop
This commit is contained in:
Martin Kroeker 2018-05-23 15:12:20 +02:00 committed by GitHub
commit 939452ea9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
81 changed files with 1477 additions and 421 deletions

View File

@ -7,6 +7,7 @@ language: c
jobs:
include:
- &test-ubuntu
os: linux
stage: test
compiler: gcc
addons:
@ -57,7 +58,8 @@ jobs:
- TARGET_BOX=LINUX32
- BTYPE="BINARY=32"
- stage: test
- os: linux
stage: test
compiler: gcc
addons:
apt:
@ -77,6 +79,7 @@ jobs:
# which is slower than container-based infrastructure used for jobs
# that don't require sudo.
- &test-alpine
os: linux
stage: test
dist: trusty
sudo: true
@ -120,6 +123,7 @@ jobs:
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
- &test-cmake
os: linux
stage: test
compiler: clang
addons:
@ -147,6 +151,23 @@ jobs:
env:
- CMAKE=1
- &test-macos
os: osx
stage: test
osx_image: xcode8
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
- brew install gcc # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
env:
- BTYPE="BINARY=32"
# whitelist
branches:
only:

View File

@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif

View File

@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ -101,8 +96,9 @@ endif
#Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@ -115,7 +111,7 @@ endif
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))

View File

@ -17,6 +17,10 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
ifeq ($(TARGET), 1004K)
TARGET_FLAGS = -mips32r2
endif
ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5
endif

View File

@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
# automatically detected by the the script.
# NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in.
# NO_STATIC = 1

View File

@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
# Check if $(CC) refers to a valid command and set the value to gcc if not
ifneq ($(findstring cmd.exe,$(SHELL)),)
ifeq ($(shell where $(CC) 2>NUL),)
CC = gcc
# Change the default compile to clang on Mac OSX.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif
else # POSIX-ish
ifeq ($(shell command -v $(CC) 2>/dev/null),)
ifeq ($(shell uname -s),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
else
CC = gcc
endif # Darwin
endif # CC exists
endif # Shell is sane
endif # CC is set to default
# Default Fortran compiler (FC) is selected by f_check.
@ -175,6 +184,10 @@ endif
endif
ifndef NUM_PARALLEL
NUM_PARALLEL = 1
endif
ifndef NUM_THREADS
NUM_THREADS = $(NUM_CORES)
endif
@ -230,7 +243,7 @@ endif
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), FreeBSD)
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
MD5SUM = md5 -r
endif
@ -424,7 +437,7 @@ CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -openmp
CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), PGI)
@ -555,9 +568,14 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), 1004K)
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
endif
ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif
ifeq ($(CORE), I6400)
@ -704,7 +722,7 @@ FCOMMON_OPT += -i8
endif
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
FCOMMON_OPT += -fopenmp
endif
endif
@ -952,6 +970,8 @@ endif
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif

230
README.md
View File

@ -5,175 +5,219 @@
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
We provide binary packages for the following platform.
We provide official binary packages for the following platform:
* Windows x86/x86_64
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
Building OpenBLAS requires the following to be installed:
* GNU Make
* A C compiler, e.g. GCC or Clang
* A Fortran compiler (optional, for LAPACK)
* IBM MASS (optional, see below)
### Normal compile
* type "make" to detect the CPU automatically.
or
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
The full target list is in the file `TargetList.txt`.
### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
The target must be specified explicitly when cross compiling.
Examples:
On X86 box, compile this library for loongson3a CPU.
* On an x86 box, compile this library for a loongson3a CPU:
```sh
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
```
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
```sh
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
```
### Debug version
make DEBUG=1
A debug version can be built using `make DEBUG=1`.
### Compile with MASS Support on Power CPU (Optional dependency)
### Compile with MASS support on Power CPU (optional)
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
The library can be installed as below -
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
* On Ubuntu:
* On Ubuntu:
```sh
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.5
```
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
sudo apt-get update</br>
sudo apt-get install libxlmass-devel.8.1.5</br>
* On RHEL/CentOS:
```sh
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.5
```
* On RHEL/CentOS:
After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
sudo rpm --import repomd.xml.key</br>
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
sudo yum install libxlmass-devel.8.1.5</br>
### Install to a specific directory (optional)
After installing MASS library, compile openblas with USE_MASS=1.
Use `PREFIX=` when invoking `make`, for example
Example:
```sh
make install PREFIX=your_installation_directory
```
Compiling on Power8 with MASS support -
The default installation directory is `/opt/OpenBLAS`.
make USE_MASS=1 TARGET=POWER8
## Supported CPUs and Operating Systems
### Install to the directory (optional)
Please read `GotoBLAS_01Readme.txt`.
Example:
### Additional supported CPUs
make install PREFIX=your_installation_directory
#### x86/x86-64
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
#### MIPS64
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental
#### ARM:
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
#### ARM
#### ARM64:
- **ARMV8**: Experimental
- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
#### ARM64
- **ARMv8**: Experimental
- **ARM Cortex-A57**: Experimental
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
#### IBM zEnterprise System:
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
### Support OS:
### Supported OS
- **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
## Usage
### Set the number of threads with environment variables.
Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
compiled as a shared library.
Examples:
### Setting the number of threads using environment variables
export OPENBLAS_NUM_THREADS=4
Environment variables are used to specify a maximum number of threads.
For example,
or
```sh
export OPENBLAS_NUM_THREADS=4
export GOTO_NUM_THREADS=4
export OMP_NUM_THREADS=4
```
export GOTO_NUM_THREADS=4
The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
or
If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
compiled with `USE_OPENMP=1`.
export OMP_NUM_THREADS=4
### Setting the number of threads at runtime
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
We provide the following functions to control the number of threads at runtime:
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
```c
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
```
### Set the number of threads on runtime.
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
We provided the below functions to control the number of threads on runtime.
## Reporting bugs
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
## Contact
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
## ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
## Change log
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default.
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
Makefile.rule. However, note that this may cause
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
However, it will be okay when you run the same test case on the shell.
## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
to start a discussion around a feature idea or a bug.
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
3. Write a test which shows that the bug was fixed or that the feature works as expected.
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
## Donation
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).

View File

@ -56,6 +56,7 @@ CELL
3.MIPS CPU:
P5600
1004K
4.MIPS64 CPU:
SICORTEX

View File

@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`.
Despite its name, and due to the use of memory buffers in functions like SGEMM,
the setting of NUM_THREADS can be relevant even for a single-threaded build
of OpenBLAS, if such functions get called by multiple threads of a program
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
a segmentation fault without displaying the above warning first.
Note that the number of threads used at runtime can be altered to differ from the
value NUM_THREADS was set to at build time. At runtime, the actual number of
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
that this does not change the number of memory buffers that will be allocated,
which is set at build time). The number of threads for a process can be set by
using the mechanisms described below.
#### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS

View File

@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/);
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
$os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/);

View File

@ -1,6 +1,7 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@

View File

@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING)
endif()
if (NOT DEFINED NUM_PARALLEL)
set(NUM_PARALLEL 1)
endif()
if (NOT DEFINED NUM_THREADS)
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
# HT?
@ -224,6 +228,8 @@ endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif ()

View File

@ -93,7 +93,7 @@ extern "C" {
#include <sched.h>
#endif
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
#include <sched.h>
#endif
@ -179,7 +179,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
@ -649,6 +649,12 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif
#if (__STDC_VERSION__ >= 201112L)
#ifndef _Atomic
#define _Atomic volatile
#endif
#include <stdatomic.h>
#endif
#else
#ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak));

View File

@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
result = x/y;
return result;
#else
#if (MAX_CPU_NUMBER > 64)
if ( y > 64) {
result = x/y;
return result;
}
#endif
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
@ -327,7 +333,7 @@ REALNAME:
#endif
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
#define PROLOGUE \
.text; \
.align 16; \

View File

@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x;
#if (MAX_CPU_NUMBER > 64)
if (y > 64) {
result = x / y;
return result;
}
#endif
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
@ -403,7 +410,7 @@ REALNAME:
#define EPILOGUE .end
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \
.text; \
.align 512; \

View File

@ -121,7 +121,7 @@ int detect(void)
return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX2T99;
}

View File

@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0
#define CPU_P5600 1
#define CPU_1004K 2
static char *cpuname[] = {
"UNKOWN",
"P5600"
"P5600",
"1004K"
};
int detect(void){
@ -90,7 +92,7 @@ int detect(void){
if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
fprintf(stderr, "%s \n", p);
#endif
break;
}
@ -99,43 +101,13 @@ int detect(void){
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
if (strstr(p, "5600")) {
return CPU_P5600;
} else if (strstr(p, "1004K")) {
return CPU_1004K;
} else
return CPU_UNKNOWN;
}
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif
return CPU_UNKNOWN;
}
@ -149,7 +121,7 @@ void get_architecture(void){
}
void get_subarchitecture(void){
if(detect()==CPU_P5600){
if(detect()==CPU_P5600|| detect()==CPU_1004K){
printf("P5600");
}else{
printf("UNKNOWN");
@ -170,6 +142,14 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
} else if (detect()==CPU_1004K) {
printf("#define MIPS1004K\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 26144\n");
printf("#define DTB_DEFAULT_ENTRIES 8\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{
printf("#define UNKNOWN\n");
}
@ -178,6 +158,8 @@ void get_cpuconfig(void){
void get_libname(void){
if(detect()==CPU_P5600) {
printf("p5600\n");
} else if (detect()==CPU_1004K) {
printf("1004K\n");
}else{
printf("mips\n");
}

View File

@ -60,6 +60,14 @@ OS_FREEBSD
OS_NETBSD
#endif
#if defined(__OpenBSD__)
OS_OPENBSD
#endif
#if defined(__DragonFly__)
OS_DRAGONFLY
#endif
#if defined(__sun)
OS_SUNOS
#endif

View File

@ -91,7 +91,12 @@
#endif
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

View File

@ -67,7 +67,12 @@
#endif
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

View File

@ -91,7 +91,12 @@
#endif
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>

View File

@ -36,6 +36,7 @@
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
//#include <sys/mman.h>
@ -49,11 +50,16 @@
int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER];
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#if __STDC_VERSION__ >= 201112L
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#else
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#endif
void goto_set_num_threads(int num_threads) {
int i=0;
int i=0, j=0;
if (num_threads < 1) num_threads = blas_num_threads;
@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
for(j=0; j<blas_cpu_number; j++){
if(blas_thread_buffer[i][j]==NULL){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
for(; j<MAX_CPU_NUMBER; j++){
if(blas_thread_buffer[i][j]!=NULL){
blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
}
}
#if defined(ARCH_MIPS64)
@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){
int i=0;
int i=0, j=0;
blas_get_cpu_number();
blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
for(j=0; j<blas_num_threads; j++){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
for(; j<MAX_CPU_NUMBER; j++){
blas_thread_buffer[i][j]=NULL;
}
}
return 0;
}
int BLASFUNC(blas_thread_shutdown)(void){
int i=0;
int i=0, j=0;
blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
for(j=0; j<MAX_CPU_NUMBER; j++){
if(blas_thread_buffer[i][j]!=NULL){
blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
}
}
@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
}
}
static void exec_threads(blas_queue_t *queue){
static void exec_threads(blas_queue_t *queue, int buf_index){
void *buffer, *sa, *sb;
int pos=0, release_flag=0;
@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];
buffer = blas_thread_buffer[buf_index][pos];
//fallback
if(buffer==NULL) {
@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){
int exec_blas(BLASLONG num, blas_queue_t *queue){
BLASLONG i;
BLASLONG i, buf_index;
if ((num <= 0) || (queue == NULL)) return 0;
@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
}
#endif
while(true) {
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
#if __STDC_VERSION__ >= 201112L
_Bool inuse = false;
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
#else
if(blas_buffer_inuse[i] == false) {
blas_buffer_inuse[i] = true;
#endif
buf_index = i;
break;
}
}
if(i != MAX_PARALLEL_NUMBER)
break;
}
#pragma omp parallel for schedule(static)
for (i = 0; i < num; i ++) {
@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
queue[i].position = i;
#endif
exec_threads(&queue[i]);
exec_threads(&queue[i], buf_index);
}
#if __STDC_VERSION__ >= 201112L
atomic_store(&blas_buffer_inuse[buf_index], false);
#else
blas_buffer_inuse[buf_index] = false;
#endif
return 0;
}

View File

@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@ -209,7 +212,8 @@ int ret;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
#endif
@ -246,7 +250,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@ -344,7 +348,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
@ -368,7 +372,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif

View File

@ -54,6 +54,9 @@ static char* openblas_config_str=""
#ifdef NO_AFFINITY
"NO_AFFINITY "
#endif
#ifdef USE_OPENMP
"USE_OPENMP "
#endif
#ifndef DYNAMIC_ARCH
CHAR_CORENAME
#endif
@ -61,18 +64,23 @@ static char* openblas_config_str=""
#ifdef DYNAMIC_ARCH
char *gotoblas_corename();
static char tmp_config_str[256];
#endif
static char tmp_config_str[256];
int openblas_get_parallel();
char* CNAME() {
#ifndef DYNAMIC_ARCH
return openblas_config_str;
#else
char tmpstr[20];
strcpy(tmp_config_str, openblas_config_str);
#ifdef DYNAMIC_ARCH
strcat(tmp_config_str, gotoblas_corename());
return tmp_config_str;
#endif
if (openblas_get_parallel() == 0)
sprintf(tmpstr, " SINGLE_THREADED");
else
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
strcat(tmp_config_str, tmpstr);
return tmp_config_str;
}
@ -83,3 +91,4 @@ char* openblas_get_corename() {
return gotoblas_corename();
#endif
}

View File

@ -156,7 +156,7 @@ endif
endif
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
so : ../$(LIBSONAME)

View File

@ -97,7 +97,7 @@ if ($compiler eq "") {
if ($data =~ /Intel/) {
$vendor = INTEL;
$openmp = "-openmp";
$openmp = "-fopenmp";
}
if ($data =~ /Sun Fortran/) {
@ -127,7 +127,7 @@ if ($compiler eq "") {
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ /zho_ge__/) {
if ($data =~ / zho_ge__/) {
$need2bu = 1;
}
}
@ -155,7 +155,7 @@ if ($compiler eq "") {
if ($compiler =~ /ifort/) {
$vendor = INTEL;
$bu = "_";
$openmp = "-openmp";
$openmp = "-fopenmp";
}
if ($compiler =~ /pathf/) {

View File

@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef OS_WINDOWS
#include <windows.h>
#endif
#if defined(__FreeBSD__) || defined(__APPLE__)
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
@ -1074,7 +1074,7 @@ static int get_num_cores(void) {
#ifdef OS_WINDOWS
SYSTEM_INFO sysinfo;
#elif defined(__FreeBSD__) || defined(__APPLE__)
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
int m[2], count;
size_t len;
#endif
@ -1088,7 +1088,7 @@ static int get_num_cores(void) {
GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors;
#elif defined(__FreeBSD__) || defined(__APPLE__)
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
m[0] = CTL_HW;
m[1] = HW_NCPU;
len = sizeof(int);

View File

@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else
nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}

View File

@ -29,10 +29,8 @@ USE_TRMM = 1
endif
ifeq ($(CORE), HASWELL)
ifeq ($(ARCH), x86_64)
USE_TRMM = 1
endif
endif
ifeq ($(CORE), ZEN)
USE_TRMM = 1

View File

@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble axpy_kernel_L999
/*
cmp INC_X, #0
beq axpy_kernel_L999
cmp INC_Y, #0
beq axpy_kernel_L999
*/
cmp INC_X, #1
bne axpy_kernel_S_BEGIN

View File

@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble rot_kernel_L999
/*
cmp INC_X, #0
beq rot_kernel_L999
cmp INC_Y, #0
beq rot_kernel_L999
*/
cmp INC_X, #1
bne rot_kernel_S_BEGIN
@ -584,6 +584,12 @@ rot_kernel_S1:
rot_kernel_S10:
KERNEL_S1
cmp INC_X, #0
beq rot_kernel_L999
cmp INC_Y, #0
beq rot_kernel_L999
subs I, I, #1
bne rot_kernel_S10

View File

@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) {
if (X > posY) {
/* ao1 += 1;
ao2 += 1; */
ao1 += 1;
ao2 += 1;
b += 2;
} else
#ifdef UNIT
if (X < posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
data02 = *(ao1 + 1);
b[ 0] = ONE;
b[ 1] = data02;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
#endif
ao1 += 2;
b += 2;
}
#endif
b[ 1] = *(ao1 + 1);
b += 2;
}
posY += 2;
@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}
// posY += 1;
posY += 1;
}
return 0;

View File

@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 15);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i;
@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a13 += i;
a14 += i;
a15 += i;
a16 += i; */
a16 += i;
b += 16 * i;
} else
if (X > posY) {
@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i;
a05 += i;
a06 += i;
a07 += i;
a08 += i; */
a08 += i;
b += 8 * i;
} else
if (X > posY) {
@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 8;
}
/* a02 += i * lda;
a02 += i * lda;
a03 += i * lda;
a04 += i * lda;
a05 += i * lda;
a06 += i * lda;
a07 += i * lda;
a08 += i * lda; */
a08 += i * lda;
} else {
#ifdef UNIT
b[ 0] = ONE;
@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i; */
a04 += i;
b += 4 * i;
} else
if (X > posY) {
@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += lda;
b += 4;
}
/* a02 += lda;
a02 += lda;
a03 += lda;
a04 += lda; */
a04 += lda;
} else {
#ifdef UNIT
@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
a01 ++;
a02 ++;
} else {
#ifdef UNIT
b += 2;
} else
if (X > posY) {
#endif
b[ 0] = *(a01 + 0);
#ifdef UNIT
b[ 1] = *(a01 + 1);
a01 += lda;
b += 2;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
b[ 1] = *(a01 + 1);
#else
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
#endif
b[ 1] = *(a01 + 1);
}
b += 2;
b += 2;
}
}
posY += 2;
}
@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i > 0) {
do {
if (X < posY) {
a01 ++;
} else {
#ifdef UNIT
a01 += 1;
b ++;
} else
if (X > posY) {
#endif
b[ 0] = *(a01 + 0);
#ifdef UNIT
a01 += lda;
b ++;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
#else
b[ 0] = *(a01 + 0);
#endif
a01 += lda;
}
b ++;
X ++;
i --;
a01 += lda;
b ++;
}
X += 1;
i --;
} while (i > 0);
}
// posY += 1;
posY += 1;
}
return 0;

View File

@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) {
if (X < posY) {
/* ao1 += 1;
ao2 += 1; */
ao1 += 1;
ao2 += 1;
b += 2;
} else
if (X > posY) {
@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = ZERO;
#endif
// ao1 += lda;
ao1 += lda;
b += 2;
}
}
@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (m > 0) {
do {
if (X < posY) {
b += 1;
ao1 += 1;
} else
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += lda;
} else {
#ifdef UNIT
if (X > posY) {
b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
} else {
b[ 0] = ONE;
}
#endif
b ++;
ao1 += lda;
X ++;
b += 1;
ao1 += lda;
}
X += 1;
i --;
} while (i > 0);
}

View File

@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
if (m & 2) {
/* ao1 += 2;
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2; */
ao4 += 2;
b += 8;
}
if (m & 1) {
/* ao1 += 1;
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1; */
ao4 += 1;
b += 4;
}
@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data08;
ao1 += 2 * lda;
// ao2 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = data03;
b[ 3] = data04;
// ao1 += lda;
ao1 += lda;
b += 4;
}
@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i) {
if (X < posY) {
// ao1 += 2;
ao1 += 2;
b += 2;
} else
if (X > posY) {
@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X < posY) {
b += 1;
ao1 += 1;
} else {
#ifdef UNIT
} else
if (X > posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
data01 = *(ao1 + 0);
b[ 0] = data01;
ao1 += lda;
b += 1;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
ao1 += lda;
}
b ++;
X ++;
ao1 += lda;
b += 1;
}
X += 1;
i --;
} while (i > 0);
}

View File

@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
a1 += 2 * lda;
// a2 += 2 * lda;
a2 += 2 * lda;
b += 8;
ii += 2;

View File

@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
#ifdef UNIT
if (X > posY) {
ao1 += 2;
ao2 += 2;
b += 4;
} else
if (X < posY) {
#endif
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
#ifdef UNIT
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
ao1 += lda;
b += 4;
} else {
#ifdef UNIT
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = ONE;
b[ 1] = ZERO;
}
b[ 2] = data3;
b[ 3] = data4;
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
#endif
b += 4;
b += 4;
}
}
posY += 2;
@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}
// posY += 1;
posY += 1;
}
return 0;

View File

@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj;
FLOAT data01 = 0.0, data02 = 0.0;
FLOAT data01, data02;
FLOAT *a1;
lda *= 2;

View File

@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj;
FLOAT data01 = 0.0, data02 = 0.0, data03, data04;
FLOAT data05, data06, data07 = 0.0, data08 = 0.0;
FLOAT data01, data02, data03, data04;
FLOAT data05, data06, data07, data08;
FLOAT *a1, *a2;
lda *= 2;

1
kernel/mips/KERNEL.1004K Normal file
View File

@ -0,0 +1 @@
include $(KERNELDIR)/KERNEL.P5600

View File

@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c
else
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/asum.c
ZASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
endif
ifdef HAVE_MSA
@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
endif

View File

@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v2f64 v_alpha;
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0;
v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

View File

@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
while(i < n)
{
dot += y[iy] * x[ix] ;
#if defined(DSDOT)
dot += (double)y[iy] * (double)x[ix] ;
#else
dot += y[iy] * x[ix];
#endif
ix += inc_x ;
iy += inc_y ;
i++ ;

View File

@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *y_org = y;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0;
v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0};
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0};
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);

View File

@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */
if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) {
if( LAPACKE_c_nancheck( n, x, incx ) ) {
return -2;
}
}

View File

@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
lapack_int lrv, lcv; /* row, column stride */
if( matrix_layout == LAPACK_COL_MAJOR ) {
lrv = 1;
lcv = ldv;
} else {
lrv = ldv;
lcv = 1;
}
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13;
}
@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
ldv ) )
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) {
@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
return -8;
}
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) )
&v[(nrows_v-k)*lrv], ldv ) )
return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
ldv ) )
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
return -8;
}
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
ldv ) )
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k,
&v[(ncols_v-k)*lcv], ldv ) )
return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9;

View File

@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) {
return -2;
}
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_c_nancheck( n-1, x, incx ) ) {
return -3;
}
}

View File

@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
lapack_complex_float tau, lapack_complex_float* c,
lapack_int ldc, lapack_complex_float* work )
{
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_clarfx", -1 );
return -1;
@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) {
return -6;
}
if( LAPACKE_c_nancheck( m, v, 1 ) ) {
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_c_nancheck( lv, v, 1 ) ) {
return -5;
}
}

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x,
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_c_nancheck( n, x, incx ) ) {
return -2;
}
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10;
}
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
return -9;
}
}

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9;
}
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
return -8;
}
}

View File

@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
lapack_int lrv, lcv; /* row, column stride */
if( matrix_layout == LAPACK_COL_MAJOR ) {
lrv = 1;
lcv = ldv;
} else {
lrv = ldv;
lcv = 1;
}
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13;
}
@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
ldv ) )
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) {
@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
return -8;
}
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) )
&v[(nrows_v-k)*lrv], ldv ) )
return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
ldv ) )
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
return -8;
}
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
ldv ) )
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k,
&v[(ncols_v-k)*lcv], ldv ) )
return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9;

View File

@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) {
return -2;
}
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_d_nancheck( n-1, x, incx ) ) {
return -3;
}
}

View File

@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
lapack_int n, const double* v, double tau, double* c,
lapack_int ldc, double* work )
{
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_dlarfx", -1 );
return -1;
@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) {
return -6;
}
if( LAPACKE_d_nancheck( m, v, 1 ) ) {
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_d_nancheck( lv, v, 1 ) ) {
return -5;
}
}

View File

@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_d_nancheck( n, x, incx ) ) {
return -2;
}
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {

View File

@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9;
}
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
return -8;
}
}

View File

@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10;
}
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
return -9;
}
}

View File

@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
lapack_int lrv, lcv; /* row, column stride */
if( matrix_layout == LAPACK_COL_MAJOR ) {
lrv = 1;
lcv = ldv;
} else {
lrv = ldv;
lcv = 1;
}
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13;
}
@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
ldv ) )
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) {
@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
return -8;
}
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) )
&v[(nrows_v-k)*lrv], ldv ) )
return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
ldv ) )
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
return -8;
}
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
ldv ) )
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k,
&v[(ncols_v-k)*lcv], ldv ) )
return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9;

View File

@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) {
return -2;
}
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_s_nancheck( n-1, x, incx ) ) {
return -3;
}
}

View File

@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
lapack_int n, const float* v, float tau, float* c,
lapack_int ldc, float* work )
{
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_slarfx", -1 );
return -1;
@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) {
return -6;
}
if( LAPACKE_s_nancheck( m, v, 1 ) ) {
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_s_nancheck( lv, v, 1 ) ) {
return -5;
}
}

View File

@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_s_nancheck( n, x, incx ) ) {
return -2;
}
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {

View File

@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9;
}
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
return -8;
}
}

View File

@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10;
}
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
return -9;
}
}

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */
if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) {
if( LAPACKE_z_nancheck( n, x, incx ) ) {
return -2;
}
}

View File

@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) &&
LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
lapack_int lrv, lcv; /* row, column stride */
if( matrix_layout == LAPACK_COL_MAJOR ) {
lrv = 1;
lcv = ldv;
} else {
lrv = ldv;
lcv = 1;
}
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13;
}
@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
ldv ) )
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) {
@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
return -8;
}
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) )
&v[(nrows_v-k)*lrv], ldv ) )
return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
ldv ) )
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k,
&v[k*lrv], ldv ) )
return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
return -8;
}
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
ldv ) )
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k,
&v[(ncols_v-k)*lcv], ldv ) )
return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9;

View File

@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) {
return -2;
}
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_z_nancheck( n-1, x, incx ) ) {
return -3;
}
}

View File

@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
lapack_complex_double tau, lapack_complex_double* c,
lapack_int ldc, lapack_complex_double* work )
{
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_zlarfx", -1 );
return -1;
@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) {
return -6;
}
if( LAPACKE_z_nancheck( m, v, 1 ) ) {
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_z_nancheck( lv, v, 1 ) ) {
return -5;
}
}

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x,
#ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
if( LAPACKE_z_nancheck( n, x, incx ) ) {
return -2;
}
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10;
}
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
return -9;
}
}

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9;
}
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
return -8;
}
}

View File

@ -512,7 +512,7 @@ C END IF
*
* Call the kernel
*
#if defined(_OPENMP) && _OPENMP >= 201307L
#if defined(_OPENMP) && _OPENMP >= 201307
IF( TTYPE.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$ DEPEND(in:WORK(MYID-1))

View File

@ -481,7 +481,7 @@
*
* Call the kernel
*
#if defined(_OPENMP) && _OPENMP >= 201307L
#if defined(_OPENMP) && _OPENMP >= 201307
IF( TTYPE.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$ DEPEND(in:WORK(MYID-1))

View File

@ -512,7 +512,7 @@ C END IF
*
* Call the kernel
*
#if defined(_OPENMP) && _OPENMP >= 201307L
#if defined(_OPENMP) && _OPENMP >= 201307
IF( TTYPE.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))

View File

@ -67,6 +67,26 @@ double sqrt(double);
#undef GETRF_FACTOR
#define GETRF_FACTOR 1.00
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t getrf_lock = 0;
#else
static BLASULONG getrf_lock = 0UL;
#endif
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t getrf_flag_lock = 0;
#else
static BLASULONG getrf_flag_lock = 0UL;
#endif
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
double m = (double)(M - IS - BK);
@ -99,7 +119,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
FLOAT *sbb = sb;
#if __STDC_VERSION__ >= 201112L
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
#else
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
#endif
blasint *ipiv = (blasint *)args -> c;
@ -177,7 +201,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
/* Non blocking implementation */
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
@ -216,9 +245,12 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
FLOAT *sbb= sb;
blasint *ipiv = (blasint *)args -> c;
BLASLONG jw;
#if __STDC_VERSION__ >= 201112L
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
#else
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
#endif
if (args -> a == NULL) {
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@ -245,8 +277,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
for (i = 0; i < args -> nthreads; i++)
#if 1
{
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
} while (jw);
}
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
#endif
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@ -283,18 +327,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
b + (is + jjs * lda) * COMPSIZE, lda, is);
}
}
MB;
for (i = 0; i < args -> nthreads; i++)
for (i = 0; i < args -> nthreads; i++) {
LOCK_COMMAND(&getrf_lock);
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
UNLOCK_COMMAND(&getrf_lock);
}
}
LOCK_COMMAND(&getrf_flag_lock);
flag[mypos * CACHE_LINE_SIZE] = 0;
UNLOCK_COMMAND(&getrf_flag_lock);
if (m == 0) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
LOCK_COMMAND(&getrf_lock);
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
UNLOCK_COMMAND(&getrf_lock);
}
}
@ -318,7 +367,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
if ((current != mypos) && (!is)) {
#if 1
LOCK_COMMAND(&getrf_lock);
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
} while (jw == 0);
#else
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
#endif
}
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
@ -327,7 +387,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
MB;
if (is + min_i >= m) {
LOCK_COMMAND(&getrf_lock);
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
UNLOCK_COMMAND(&getrf_lock);
}
}
@ -339,7 +401,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
#if 1
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
UNLOCK_COMMAND(&getrf_lock);
} while(jw != 0);
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
#endif
}
}
@ -374,11 +447,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
BLASLONG i, j, k, is, bk;
BLASLONG num_cpu;
BLASLONG f;
#ifdef _MSC_VER
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
#else
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#endif
#ifndef COMPLEX
@ -501,11 +580,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (mm >= nn) {
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = nn;
if (nn < width) width = nn;
nn -= width;
range_N[num_cpu + 1] = range_N[num_cpu] + width;
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = mm;
if (mm < width) width = mm;
if (nn <= 0) width = mm;
mm -= width;
@ -514,11 +595,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
} else {
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = mm;
if (mm < width) width = mm;
mm -= width;
range_M[num_cpu + 1] = range_M[num_cpu] + width;
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = nn;
if (nn < width) width = nn;
if (mm <= 0) width = nn;
nn -= width;
@ -561,7 +644,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_n_new[1] = offset + is + bk;
if (num_cpu > 0) {
queue[num_cpu - 1].next = NULL;
exec_blas_async(0, &queue[0]);
@ -572,8 +654,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (iinfo && !info) info = iinfo + is;
for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
for (i = 0; i < num_cpu; i ++) {
#if 1
LOCK_COMMAND(&getrf_flag_lock);
f=flag[i*CACHE_LINE_SIZE];
UNLOCK_COMMAND(&getrf_flag_lock);
while (f!=0) {
LOCK_COMMAND(&getrf_flag_lock);
f=flag[i*CACHE_LINE_SIZE];
UNLOCK_COMMAND(&getrf_flag_lock);
};
#else
while (flag[i*CACHE_LINE_SIZE]) {};
#endif
}
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
} else {
@ -634,8 +728,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
BLASLONG range[MAX_CPU_NUMBER + 1];
BLASLONG width, nn, num_cpu;
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#ifndef COMPLEX
#ifdef XDOUBLE

View File

@ -0,0 +1,664 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifndef USE_SIMPLE_THREADED_LEVEL3
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
static FLOAT dm1 = -1.;
#ifndef KERNEL_FUNC
#ifndef LOWER
#define KERNEL_FUNC SYRK_KERNEL_U
#else
#define KERNEL_FUNC SYRK_KERNEL_L
#endif
#endif
#ifndef LOWER
#ifndef COMPLEX
#define TRSM_KERNEL TRSM_KERNEL_LT
#else
#define TRSM_KERNEL TRSM_KERNEL_LC
#endif
#else
#ifndef COMPLEX
#define TRSM_KERNEL TRSM_KERNEL_RN
#else
#define TRSM_KERNEL TRSM_KERNEL_RR
#endif
#endif
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif
#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef LOWER
#define TRANS
#endif
#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
#elif !defined(LOWER) && defined(TRANS)
#define SYRK_LOCAL SYRK_UT
#elif defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_LN
#else
#define SYRK_LOCAL SYRK_LT
#endif
#endif
typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#endif
#endif
#ifndef ICOPY_OPERATION
#ifndef TRANS
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif
#ifndef OCOPY_OPERATION
#ifdef TRANS
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif
#ifndef S
#define S args -> a
#endif
#ifndef A
#define A args -> b
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef N
#define N args -> m
#endif
#ifndef K
#define K args -> k
#endif
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
FLOAT *buffer[DIVIDE_RATE];
BLASLONG k, lda;
BLASLONG m_from, m_to;
FLOAT *alpha;
FLOAT *a, *c;
job_t *job = (job_t *)args -> common;
BLASLONG xxx, bufferside;
BLASLONG jjs, min_jj;
BLASLONG is, min_i, div_n;
BLASLONG i, current;
k = K;
a = (FLOAT *)A;
c = (FLOAT *)C;
lda = LDA;
alpha = (FLOAT *)args -> alpha;
m_from = range_n[mypos + 0];
m_to = range_n[mypos + 1];
#if 0
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
#endif
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
}
#ifndef LOWER
TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(m_to, xxx + div_n) - jjs;
#ifndef LOWER
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
if (min_jj > GEMM_P) min_jj = GEMM_P;
#endif
#ifndef LOWER
OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
TRSM_KERNEL (k, min_jj, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
sb,
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
a + jjs * lda * COMPSIZE, lda, 0);
#else
ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
TRSM_KERNEL (min_jj, k, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
sb,
a + jjs * COMPSIZE, lda, 0);
#endif
}
#ifndef LOWER
for (i = 0; i <= mypos; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
for (i = mypos; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif
WMB;
}
min_i = m_to - m_from;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif
current = mypos;
#ifndef LOWER
while (current < args -> nthreads)
#else
while (current >= 0)
#endif
{
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
/* thread has to wait */
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, m_from, xxx);
if (m_from + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
#ifndef LOWER
current ++;
#else
current --;
#endif
}
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif
current = mypos;
#ifndef LOWER
while (current < args -> nthreads)
#else
while (current >= 0)
#endif
{
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, is, xxx);
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
#ifndef LOWER
current ++;
#else
current --;
#endif
}
}
for (i = 0; i < args -> nthreads; i++) {
if (i != mypos) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
}
}
}
return 0;
}
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
BLASLONG num_cpu;
BLASLONG nthreads = args -> nthreads;
BLASLONG width, i, j, k;
BLASLONG n, n_from, n_to;
int mode, mask;
double dnum;
#ifndef COMPLEX
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_REAL;
mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
mode = BLAS_SINGLE | BLAS_REAL;
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_COMPLEX;
mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif
#endif
newarg.m = args -> m;
newarg.k = args -> k;
newarg.a = args -> a;
newarg.b = args -> b;
newarg.c = args -> c;
newarg.lda = args -> lda;
newarg.alpha = args -> alpha;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
n_from = 0;
n_to = args -> m;
#ifndef LOWER
range[MAX_CPU_NUMBER] = n_to - n_from;
range[0] = 0;
num_cpu = 0;
i = 0;
n = n_to - n_from;
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
if (nthreads - num_cpu > 1) {
double di = (double)i;
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;
} else {
width = n - i;
}
range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = inner_thread;
queue[num_cpu].args = &newarg;
queue[num_cpu].range_m = NULL;
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
i += width;
}
for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];
#else
range[0] = 0;
num_cpu = 0;
i = 0;
n = n_to - n_from;
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
if (nthreads - num_cpu > 1) {
double di = (double)i;
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;
} else {
width = n - i;
}
range[num_cpu + 1] = range[num_cpu] + width;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = inner_thread;
queue[num_cpu].args = &newarg;
queue[num_cpu].range_m = NULL;
queue[num_cpu].range_n = range;
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
i += width;
}
#endif
newarg.nthreads = num_cpu;
if (num_cpu) {
for (j = 0; j < num_cpu; j++) {
for (i = 0; i < num_cpu; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[j].working[i][CACHE_LINE_SIZE * k] = 0;
}
}
}
queue[0].sa = sa;
queue[0].sb = sb;
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}
#endif
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
BLASLONG n, bk, i, blocking, lda;
BLASLONG info;
int mode;
blas_arg_t newarg;
FLOAT *a;
FLOAT alpha[2] = { -ONE, ZERO};
#ifndef COMPLEX
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
if (args -> nthreads == 1) {
#ifndef LOWER
info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
#else
info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
#endif
return info;
}
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
if (range_n) n = range_n[1] - range_n[0];
if (n <= GEMM_UNROLL_N * 2) {
#ifndef LOWER
info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
#else
info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
#endif
return info;
}
newarg.lda = lda;
newarg.ldb = lda;
newarg.ldc = lda;
newarg.alpha = alpha;
newarg.beta = NULL;
newarg.nthreads = args -> nthreads;
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
if (blocking > GEMM_Q) blocking = GEMM_Q;
for (i = 0; i < n; i += blocking) {
bk = n - i;
if (bk > blocking) bk = blocking;
newarg.m = bk;
newarg.n = bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
if (info) return info + i;
if (n - i - bk > 0) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
newarg.m = n - i - bk;
newarg.k = bk;
#ifndef LOWER
newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE;
#else
newarg.b = a + ((i + bk) + i * lda) * COMPSIZE;
#endif
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
thread_driver(&newarg, sa, sb);
#else
#ifndef LOWER
newarg.m = bk;
newarg.n = n - i - bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
gemm_thread_n(mode | BLAS_TRANSA_T,
&newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);
newarg.n = n - i - bk;
newarg.k = bk;
newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE;
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
#if 0
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
#else
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
&newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
#endif
#else
newarg.m = n - i - bk;
newarg.n = bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
newarg.b = a + (i + bk + i * lda) * COMPSIZE;
gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
&newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);
newarg.n = n - i - bk;
newarg.k = bk;
newarg.a = a + (i + bk + i * lda) * COMPSIZE;
newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
#if 0
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
#else
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
&newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
#endif
#endif
#endif
}
}
return 0;
}

View File

@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500)
#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500)
#define SNUMOPT 2
#define DNUMOPT 2

View File

@ -25,6 +25,7 @@ endif ()
# known to hang with the native Windows and Android threads
# FIXME needs checking if this works on any of the other platforms
if (NOT NO_CBLAS)
if (NOT USE_OPENMP)
if (OS_CYGWIN_NT OR OS_LINUX)
set(OpenBLAS_utest_src
@ -33,6 +34,7 @@ set(OpenBLAS_utest_src
)
endif()
endif()
endif()
if (NOT NO_LAPACK)
set(OpenBLAS_utest_src

View File

@ -17,11 +17,13 @@ endif
#this does not work with OpenMP nor with native Windows or Android threads
# FIXME TBD if this works on OSX, SunOS, POWER and zarch
ifneq ($(NO_CBLAS), 1)
ifndef USE_OPENMP
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
OBJS += test_fork.o
endif
endif
endif
all : run_test

View File

@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "openblas_utest.h"
#include <sys/types.h>
#include <sys/wait.h>
#include <cblas.h>