Merge pull request #1570 from xianyi/develop

Update release-0.3.0 branch to match develop
This commit is contained in:
Martin Kroeker 2018-05-23 15:12:20 +02:00 committed by GitHub
commit 939452ea9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
81 changed files with 1477 additions and 421 deletions

View File

@ -7,6 +7,7 @@ language: c
jobs: jobs:
include: include:
- &test-ubuntu - &test-ubuntu
os: linux
stage: test stage: test
compiler: gcc compiler: gcc
addons: addons:
@ -57,7 +58,8 @@ jobs:
- TARGET_BOX=LINUX32 - TARGET_BOX=LINUX32
- BTYPE="BINARY=32" - BTYPE="BINARY=32"
- stage: test - os: linux
stage: test
compiler: gcc compiler: gcc
addons: addons:
apt: apt:
@ -77,6 +79,7 @@ jobs:
# which is slower than container-based infrastructure used for jobs # which is slower than container-based infrastructure used for jobs
# that don't require sudo. # that don't require sudo.
- &test-alpine - &test-alpine
os: linux
stage: test stage: test
dist: trusty dist: trusty
sudo: true sudo: true
@ -120,6 +123,7 @@ jobs:
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
- &test-cmake - &test-cmake
os: linux
stage: test stage: test
compiler: clang compiler: clang
addons: addons:
@ -147,6 +151,23 @@ jobs:
env: env:
- CMAKE=1 - CMAKE=1
- &test-macos
os: osx
stage: test
osx_image: xcode8
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
- brew install gcc # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
env:
- BTYPE="BINARY=32"
# whitelist # whitelist
branches: branches:
only: only:

View File

@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif

View File

@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ -101,8 +96,9 @@ endif
#Generating openblas.pc #Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@ -115,7 +111,7 @@ endif
ifndef NO_SHARED ifndef NO_SHARED
#ifeq logical or #ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))

View File

@ -17,6 +17,10 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99 EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif endif
ifeq ($(TARGET), 1004K)
TARGET_FLAGS = -mips32r2
endif
ifeq ($(TARGET), P5600) ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5 TARGET_FLAGS = -mips32r5
endif endif

View File

@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
# automatically detected by the the script. # automatically detected by the the script.
# NUM_THREADS = 24 # NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in. # if you don't need to install the static library, please comment it in.
# NO_STATIC = 1 # NO_STATIC = 1

View File

@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables # http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW). # - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default) ifeq ($(origin CC),default)
# Check if $(CC) refers to a valid command and set the value to gcc if not
ifneq ($(findstring cmd.exe,$(SHELL)),)
ifeq ($(shell where $(CC) 2>NUL),)
CC = gcc CC = gcc
# Change the default compile to clang on Mac OSX.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif endif
else # POSIX-ish
ifeq ($(shell command -v $(CC) 2>/dev/null),)
ifeq ($(shell uname -s),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
else
CC = gcc
endif # Darwin
endif # CC exists
endif # Shell is sane
endif # CC is set to default
# Default Fortran compiler (FC) is selected by f_check. # Default Fortran compiler (FC) is selected by f_check.
@ -175,6 +184,10 @@ endif
endif endif
ifndef NUM_PARALLEL
NUM_PARALLEL = 1
endif
ifndef NUM_THREADS ifndef NUM_THREADS
NUM_THREADS = $(NUM_CORES) NUM_THREADS = $(NUM_CORES)
endif endif
@ -230,7 +243,7 @@ endif
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
ifeq ($(OSNAME), FreeBSD) ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
@ -424,7 +437,7 @@ CCOMMON_OPT += -fopenmp
endif endif
ifeq ($(C_COMPILER), INTEL) ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -openmp CCOMMON_OPT += -fopenmp
endif endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
@ -555,9 +568,14 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64
endif endif
ifeq ($(CORE), 1004K)
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
endif
ifeq ($(CORE), P5600) ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif endif
ifeq ($(CORE), I6400) ifeq ($(CORE), I6400)
@ -704,7 +722,7 @@ FCOMMON_OPT += -i8
endif endif
endif endif
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp FCOMMON_OPT += -fopenmp
endif endif
endif endif
@ -952,6 +970,8 @@ endif
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
ifdef USE_SIMPLE_THREADED_LEVEL3 ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif endif

228
README.md
View File

@ -5,175 +5,219 @@
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
## Introduction ## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages ## Binary Packages
We provide binary packages for the following platform.
We provide official binary packages for the following platform:
* Windows x86/x86_64 * Windows x86/x86_64
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
## Installation from Source ## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
Building OpenBLAS requires the following to be installed:
* GNU Make
* A C compiler, e.g. GCC or Clang
* A Fortran compiler (optional, for LAPACK)
* IBM MASS (optional, see below)
### Normal compile ### Normal compile
* type "make" to detect the CPU automatically.
or Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
The full target list is in the file `TargetList.txt`.
### Cross compile ### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
The target must be specified explicitly when cross compiling.
Examples: Examples:
On X86 box, compile this library for loongson3a CPU. * On an x86 box, compile this library for a loongson3a CPU:
```sh
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
```
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
```sh
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
```
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
### Debug version ### Debug version
make DEBUG=1 A debug version can be built using `make DEBUG=1`.
### Compile with MASS Support on Power CPU (Optional dependency) ### Compile with MASS support on Power CPU (optional)
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. consists of a set of mathematical functions for C, C++, and Fortran applications that are
The library can be installed as below - are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
* On Ubuntu: * On Ubuntu:
```sh
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.5
```
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br> * On RHEL/CentOS:
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br> ```sh
sudo apt-get update</br> wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo apt-get install libxlmass-devel.8.1.5</br> sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.5
```
* On RHEL/CentOS: After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br> ### Install to a specific directory (optional)
sudo rpm --import repomd.xml.key</br>
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
sudo yum install libxlmass-devel.8.1.5</br>
After installing MASS library, compile openblas with USE_MASS=1. Use `PREFIX=` when invoking `make`, for example
Example: ```sh
make install PREFIX=your_installation_directory
```
Compiling on Power8 with MASS support - The default installation directory is `/opt/OpenBLAS`.
make USE_MASS=1 TARGET=POWER8 ## Supported CPUs and Operating Systems
### Install to the directory (optional) Please read `GotoBLAS_01Readme.txt`.
Example: ### Additional supported CPUs
make install PREFIX=your_installation_directory #### x86/x86-64
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64: #### MIPS64
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental - **ICT Loongson 3B**: Experimental
#### ARM: #### ARM
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
#### ARM64: - **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
- **ARMV8**: Experimental - **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
#### ARM64
- **ARMv8**: Experimental
- **ARM Cortex-A57**: Experimental - **ARM Cortex-A57**: Experimental
#### PPC/PPC64 #### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
#### IBM zEnterprise System: - **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
### Supported OS
### Support OS:
- **GNU/Linux** - **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
- **FreeBSD**: Supported by community. We didn't test the library on this OS. - **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. - **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages ## Usage
Link with libopenblas.a or -lopenblas for shared library.
### Set the number of threads with environment variables. Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
compiled as a shared library.
Examples: ### Setting the number of threads using environment variables
export OPENBLAS_NUM_THREADS=4 Environment variables are used to specify a maximum number of threads.
For example,
or ```sh
export OPENBLAS_NUM_THREADS=4
export GOTO_NUM_THREADS=4
export OMP_NUM_THREADS=4
```
export GOTO_NUM_THREADS=4 The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
or If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
compiled with `USE_OPENMP=1`.
export OMP_NUM_THREADS=4 ### Setting the number of threads at runtime
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. We provide the following functions to control the number of threads at runtime:
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. ```c
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
```
### Set the number of threads on runtime. If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
We provided the below functions to control the number of threads on runtime. ## Reporting bugs
void goto_set_num_threads(int num_threads); Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
## Contact ## Contact
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
## ChangeLog ## Change log
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
## Troubleshooting ## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). Clang 3.0 will generate the wrong AVX binary code.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default.
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
Makefile.rule. However, note that this may cause
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
However, it will be okay when you run the same test case on the shell.
## Contributing ## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
1. Write a test which shows that the bug was fixed or that the feature works as expected. to start a discussion around a feature idea or a bug.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. 2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
3. Write a test which shows that the bug was fixed or that the feature works as expected.
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
## Donation ## Donation
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).

View File

@ -56,6 +56,7 @@ CELL
3.MIPS CPU: 3.MIPS CPU:
P5600 P5600
1004K
4.MIPS64 CPU: 4.MIPS64 CPU:
SICORTEX SICORTEX

View File

@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`. `MAX_CPU_NUMBER=NUM_THREADS`.
Despite its name, and due to the use of memory buffers in functions like SGEMM,
the setting of NUM_THREADS can be relevant even for a single-threaded build
of OpenBLAS, if such functions get called by multiple threads of a program
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
a segmentation fault without displaying the above warning first.
Note that the number of threads used at runtime can be altered to differ from the
value NUM_THREADS was set to at build time. At runtime, the actual number of
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
that this does not change the number of memory buffers that will be allocated,
which is set at build time). The number of threads for a process can be set by
using the mechanisms described below.
#### How can I use OpenBLAS in multi-threaded applications? #### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS If your application is already multi-threaded, it will conflict with OpenBLAS

View File

@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/); $os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/);
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
$os = Darwin if ($data =~ /OS_DARWIN/); $os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/); $os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/); $os = AIX if ($data =~ /OS_AIX/);

View File

@ -1,6 +1,7 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@ Version: @OPENBLAS_VERSION@

View File

@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING)
endif() endif()
if (NOT DEFINED NUM_PARALLEL)
set(NUM_PARALLEL 1)
endif()
if (NOT DEFINED NUM_THREADS) if (NOT DEFINED NUM_THREADS)
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
# HT? # HT?
@ -224,6 +228,8 @@ endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (USE_SIMPLE_THREADED_LEVEL3) if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif () endif ()

View File

@ -93,7 +93,7 @@ extern "C" {
#include <sched.h> #include <sched.h>
#endif #endif
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
#include <sched.h> #include <sched.h>
#endif #endif
@ -179,7 +179,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL #define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) #define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#ifdef NEEDBUNDERSCORE #ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_ #define BLASFUNC(FUNC) FUNC##_
@ -649,6 +649,12 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif #endif
#if (__STDC_VERSION__ >= 201112L)
#ifndef _Atomic
#define _Atomic volatile
#endif
#include <stdatomic.h>
#endif
#else #else
#ifdef __ELF__ #ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak)); int omp_in_parallel (void) __attribute__ ((weak));

View File

@ -178,6 +178,12 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
result = x/y; result = x/y;
return result; return result;
#else #else
#if (MAX_CPU_NUMBER > 64)
if ( y > 64) {
result = x/y;
return result;
}
#endif
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
@ -327,7 +333,7 @@ REALNAME:
#endif #endif
#endif #endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 16; \ .align 16; \

View File

@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x; if (y <= 1) return x;
#if (MAX_CPU_NUMBER > 64)
if (y > 64) {
result = x / y;
return result;
}
#endif
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
@ -403,7 +410,7 @@ REALNAME:
#define EPILOGUE .end #define EPILOGUE .end
#endif #endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 512; \ .align 512; \

View File

@ -121,7 +121,7 @@ int detect(void)
return CPU_VULCAN; return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX; return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX2T99; return CPU_THUNDERX2T99;
} }

View File

@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_P5600 1 #define CPU_P5600 1
#define CPU_1004K 2
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKOWN",
"P5600" "P5600",
"1004K"
}; };
int detect(void){ int detect(void){
@ -90,7 +92,7 @@ int detect(void){
if (!strncmp("cpu", buffer, 3)){ if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
#if 0 #if 0
fprintf(stderr, "%s\n", p); fprintf(stderr, "%s \n", p);
#endif #endif
break; break;
} }
@ -99,43 +101,13 @@ int detect(void){
fclose(infile); fclose(infile);
if(p != NULL){ if(p != NULL){
if (strstr(p, "Loongson-3A")){ if (strstr(p, "5600")) {
return CPU_LOONGSON3A; return CPU_P5600;
}else if(strstr(p, "Loongson-3B")){ } else if (strstr(p, "1004K")) {
return CPU_LOONGSON3B; return CPU_1004K;
}else if (strstr(p, "Loongson-3")){ } else
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif #endif
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
@ -149,7 +121,7 @@ void get_architecture(void){
} }
void get_subarchitecture(void){ void get_subarchitecture(void){
if(detect()==CPU_P5600){ if(detect()==CPU_P5600|| detect()==CPU_1004K){
printf("P5600"); printf("P5600");
}else{ }else{
printf("UNKNOWN"); printf("UNKNOWN");
@ -170,6 +142,14 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n"); printf("#define L2_ASSOCIATIVE 8\n");
} else if (detect()==CPU_1004K) {
printf("#define MIPS1004K\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 26144\n");
printf("#define DTB_DEFAULT_ENTRIES 8\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{ }else{
printf("#define UNKNOWN\n"); printf("#define UNKNOWN\n");
} }
@ -178,6 +158,8 @@ void get_cpuconfig(void){
void get_libname(void){ void get_libname(void){
if(detect()==CPU_P5600) { if(detect()==CPU_P5600) {
printf("p5600\n"); printf("p5600\n");
} else if (detect()==CPU_1004K) {
printf("1004K\n");
}else{ }else{
printf("mips\n"); printf("mips\n");
} }

View File

@ -60,6 +60,14 @@ OS_FREEBSD
OS_NETBSD OS_NETBSD
#endif #endif
#if defined(__OpenBSD__)
OS_OPENBSD
#endif
#if defined(__DragonFly__)
OS_DRAGONFLY
#endif
#if defined(__sun) #if defined(__sun)
OS_SUNOS OS_SUNOS
#endif #endif

View File

@ -91,7 +91,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;

View File

@ -67,7 +67,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;

View File

@ -91,7 +91,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/ /*********************************************************************/
#include "common.h" #include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#include <dlfcn.h> #include <dlfcn.h>
#include <signal.h> #include <signal.h>
#include <sys/resource.h> #include <sys/resource.h>

View File

@ -36,6 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
//#include <sys/mman.h> //#include <sys/mman.h>
@ -49,11 +50,16 @@
int blas_server_avail = 0; int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER]; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#if __STDC_VERSION__ >= 201112L
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#else
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#endif
void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {
int i=0; int i=0, j=0;
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread //adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
if(blas_thread_buffer[i]==NULL){ for(j=0; j<blas_cpu_number; j++){
blas_thread_buffer[i]=blas_memory_alloc(2); if(blas_thread_buffer[i][j]==NULL){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
} }
} for(; j<MAX_CPU_NUMBER; j++){
for(; i<MAX_CPU_NUMBER; i++){ if(blas_thread_buffer[i][j]!=NULL){
if(blas_thread_buffer[i]!=NULL){ blas_memory_free(blas_thread_buffer[i][j]);
blas_memory_free(blas_thread_buffer[i]); blas_thread_buffer[i][j]=NULL;
blas_thread_buffer[i]=NULL; }
} }
} }
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)
@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){ int blas_thread_init(void){
int i=0; int i=0, j=0;
blas_get_cpu_number(); blas_get_cpu_number();
blas_server_avail = 1; blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
blas_thread_buffer[i]=blas_memory_alloc(2); for(j=0; j<blas_num_threads; j++){
} blas_thread_buffer[i][j]=blas_memory_alloc(2);
for(; i<MAX_CPU_NUMBER; i++){ }
blas_thread_buffer[i]=NULL; for(; j<MAX_CPU_NUMBER; j++){
blas_thread_buffer[i][j]=NULL;
}
} }
return 0; return 0;
} }
int BLASFUNC(blas_thread_shutdown)(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i=0; int i=0, j=0;
blas_server_avail = 0; blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
if(blas_thread_buffer[i]!=NULL){ for(j=0; j<MAX_CPU_NUMBER; j++){
blas_memory_free(blas_thread_buffer[i]); if(blas_thread_buffer[i][j]!=NULL){
blas_thread_buffer[i]=NULL; blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
} }
} }
@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} }
} }
static void exec_threads(blas_queue_t *queue){ static void exec_threads(blas_queue_t *queue, int buf_index){
void *buffer, *sa, *sb; void *buffer, *sa, *sb;
int pos=0, release_flag=0; int pos=0, release_flag=0;
@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
pos = omp_get_thread_num(); pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos]; buffer = blas_thread_buffer[buf_index][pos];
//fallback //fallback
if(buffer==NULL) { if(buffer==NULL) {
@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){
int exec_blas(BLASLONG num, blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){
BLASLONG i; BLASLONG i, buf_index;
if ((num <= 0) || (queue == NULL)) return 0; if ((num <= 0) || (queue == NULL)) return 0;
@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
} }
#endif #endif
while(true) {
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
#if __STDC_VERSION__ >= 201112L
_Bool inuse = false;
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
#else
if(blas_buffer_inuse[i] == false) {
blas_buffer_inuse[i] = true;
#endif
buf_index = i;
break;
}
}
if(i != MAX_PARALLEL_NUMBER)
break;
}
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (i = 0; i < num; i ++) { for (i = 0; i < num; i ++) {
@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
queue[i].position = i; queue[i].position = i;
#endif #endif
exec_threads(&queue[i]); exec_threads(&queue[i], buf_index);
} }
#if __STDC_VERSION__ >= 201112L
atomic_store(&blas_buffer_inuse[buf_index], false);
#else
blas_buffer_inuse[buf_index] = false;
#endif
return 0; return 0;
} }

View File

@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/resource.h> #include <sys/resource.h>
#endif #endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN) #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/resource.h> #include <sys/resource.h>
#endif #endif
@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor)) #define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor)) #define DESTRUCTOR __attribute__ ((destructor))
#else #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101))) #define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif #endif
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
@ -209,7 +212,8 @@ int ret;
size = CPU_ALLOC_SIZE(nums); size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp); ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums; if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp); ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp); CPU_FREE(cpusetp);
return nums; return nums;
#endif #endif
@ -246,7 +250,7 @@ int get_num_procs(void) {
#endif #endif
#if defined(OS_FREEBSD) #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) { int get_num_procs(void) {
@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env(); extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){ int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num; int max_num;
#endif #endif
int blas_goto_num = 0; int blas_goto_num = 0;
@ -344,7 +348,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads; if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs(); max_num = get_num_procs();
#endif #endif
@ -368,7 +372,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER; else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num; if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif #endif

View File

@ -54,6 +54,9 @@ static char* openblas_config_str=""
#ifdef NO_AFFINITY #ifdef NO_AFFINITY
"NO_AFFINITY " "NO_AFFINITY "
#endif #endif
#ifdef USE_OPENMP
"USE_OPENMP "
#endif
#ifndef DYNAMIC_ARCH #ifndef DYNAMIC_ARCH
CHAR_CORENAME CHAR_CORENAME
#endif #endif
@ -61,18 +64,23 @@ static char* openblas_config_str=""
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
char *gotoblas_corename(); char *gotoblas_corename();
static char tmp_config_str[256];
#endif #endif
static char tmp_config_str[256];
int openblas_get_parallel();
char* CNAME() { char* CNAME() {
#ifndef DYNAMIC_ARCH char tmpstr[20];
return openblas_config_str;
#else
strcpy(tmp_config_str, openblas_config_str); strcpy(tmp_config_str, openblas_config_str);
#ifdef DYNAMIC_ARCH
strcat(tmp_config_str, gotoblas_corename()); strcat(tmp_config_str, gotoblas_corename());
return tmp_config_str;
#endif #endif
if (openblas_get_parallel() == 0)
sprintf(tmpstr, " SINGLE_THREADED");
else
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
strcat(tmp_config_str, tmpstr);
return tmp_config_str;
} }
@ -83,3 +91,4 @@ char* openblas_get_corename() {
return gotoblas_corename(); return gotoblas_corename();
#endif #endif
} }

View File

@ -156,7 +156,7 @@ endif
endif endif
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
so : ../$(LIBSONAME) so : ../$(LIBSONAME)

View File

@ -97,7 +97,7 @@ if ($compiler eq "") {
if ($data =~ /Intel/) { if ($data =~ /Intel/) {
$vendor = INTEL; $vendor = INTEL;
$openmp = "-openmp"; $openmp = "-fopenmp";
} }
if ($data =~ /Sun Fortran/) { if ($data =~ /Sun Fortran/) {
@ -127,7 +127,7 @@ if ($compiler eq "") {
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores. # for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ /zho_ge__/) { if ($data =~ / zho_ge__/) {
$need2bu = 1; $need2bu = 1;
} }
} }
@ -155,7 +155,7 @@ if ($compiler eq "") {
if ($compiler =~ /ifort/) { if ($compiler =~ /ifort/) {
$vendor = INTEL; $vendor = INTEL;
$bu = "_"; $bu = "_";
$openmp = "-openmp"; $openmp = "-fopenmp";
} }
if ($compiler =~ /pathf/) { if ($compiler =~ /pathf/) {

View File

@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
#include <windows.h> #include <windows.h>
#endif #endif
#if defined(__FreeBSD__) || defined(__APPLE__) #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#include <sys/types.h> #include <sys/types.h>
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
@ -1074,7 +1074,7 @@ static int get_num_cores(void) {
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
SYSTEM_INFO sysinfo; SYSTEM_INFO sysinfo;
#elif defined(__FreeBSD__) || defined(__APPLE__) #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
int m[2], count; int m[2], count;
size_t len; size_t len;
#endif #endif
@ -1088,7 +1088,7 @@ static int get_num_cores(void) {
GetSystemInfo(&sysinfo); GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors; return sysinfo.dwNumberOfProcessors;
#elif defined(__FreeBSD__) || defined(__APPLE__) #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
m[0] = CTL_HW; m[0] = CTL_HW;
m[1] = HW_NCPU; m[1] = HW_NCPU;
len = sizeof(int); len = sizeof(int);

View File

@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else } else
nthreads = 1; nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) { if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40; buffer_size = n > 16 ? 0 : n * 4 + 40;
} }

View File

@ -29,10 +29,8 @@ USE_TRMM = 1
endif endif
ifeq ($(CORE), HASWELL) ifeq ($(CORE), HASWELL)
ifeq ($(ARCH), x86_64)
USE_TRMM = 1 USE_TRMM = 1
endif endif
endif
ifeq ($(CORE), ZEN) ifeq ($(CORE), ZEN)
USE_TRMM = 1 USE_TRMM = 1

View File

@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble axpy_kernel_L999 ble axpy_kernel_L999
/*
cmp INC_X, #0 cmp INC_X, #0
beq axpy_kernel_L999 beq axpy_kernel_L999
cmp INC_Y, #0 cmp INC_Y, #0
beq axpy_kernel_L999 beq axpy_kernel_L999
*/
cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN bne axpy_kernel_S_BEGIN

View File

@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble rot_kernel_L999 ble rot_kernel_L999
/*
cmp INC_X, #0 cmp INC_X, #0
beq rot_kernel_L999 beq rot_kernel_L999
cmp INC_Y, #0 cmp INC_Y, #0
beq rot_kernel_L999 beq rot_kernel_L999
*/
cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN bne rot_kernel_S_BEGIN
@ -585,6 +585,12 @@ rot_kernel_S10:
KERNEL_S1 KERNEL_S1
cmp INC_X, #0
beq rot_kernel_L999
cmp INC_Y, #0
beq rot_kernel_L999
subs I, I, #1 subs I, I, #1
bne rot_kernel_S10 bne rot_kernel_S10

View File

@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) { if (m & 1) {
if (X > posY) { if (X > posY) {
/* ao1 += 1; ao1 += 1;
ao2 += 1; */ ao2 += 1;
b += 2; b += 2;
} else } else
#ifdef UNIT
if (X < posY) { if (X < posY) {
#endif data01 = *(ao1 + 0);
b[ 0] = *(ao1 + 0); data02 = *(ao1 + 1);
#ifdef UNIT
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
b += 2;
} else { } else {
#ifdef UNIT
data02 = *(ao1 + 1);
b[ 0] = ONE; b[ 0] = ONE;
} b[ 1] = data02;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
#endif #endif
b[ 1] = *(ao1 + 1); ao1 += 2;
b += 2; b += 2;
}
} }
posY += 2; posY += 2;
@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0); } while (i > 0);
} }
// posY += 1; posY += 1;
} }
return 0; return 0;

View File

@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 15); i = (m & 15);
if (i > 0) { if (i > 0) {
if (X < posY) { if (X < posY) {
/* a01 += i; a01 += i;
a02 += i; a02 += i;
a03 += i; a03 += i;
a04 += i; a04 += i;
@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a13 += i; a13 += i;
a14 += i; a14 += i;
a15 += i; a15 += i;
a16 += i; */ a16 += i;
b += 16 * i; b += 16 * i;
} else } else
if (X > posY) { if (X > posY) {
@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7); i = (m & 7);
if (i > 0) { if (i > 0) {
if (X < posY) { if (X < posY) {
/* a01 += i; a01 += i;
a02 += i; a02 += i;
a03 += i; a03 += i;
a04 += i; a04 += i;
a05 += i; a05 += i;
a06 += i; a06 += i;
a07 += i; a07 += i;
a08 += i; */ a08 += i;
b += 8 * i; b += 8 * i;
} else } else
if (X > posY) { if (X > posY) {
@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 8; b += 8;
} }
/* a02 += i * lda; a02 += i * lda;
a03 += i * lda; a03 += i * lda;
a04 += i * lda; a04 += i * lda;
a05 += i * lda; a05 += i * lda;
a06 += i * lda; a06 += i * lda;
a07 += i * lda; a07 += i * lda;
a08 += i * lda; */ a08 += i * lda;
} else { } else {
#ifdef UNIT #ifdef UNIT
b[ 0] = ONE; b[ 0] = ONE;
@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3); i = (m & 3);
if (i > 0) { if (i > 0) {
if (X < posY) { if (X < posY) {
/* a01 += i; a01 += i;
a02 += i; a02 += i;
a03 += i; a03 += i;
a04 += i; */ a04 += i;
b += 4 * i; b += 4 * i;
} else } else
if (X > posY) { if (X > posY) {
@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += lda; a01 += lda;
b += 4; b += 4;
} }
/* a02 += lda; a02 += lda;
a03 += lda; a03 += lda;
a04 += lda; */ a04 += lda;
} else { } else {
#ifdef UNIT #ifdef UNIT
@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) { if (X < posY) {
a01 ++; a01 ++;
a02 ++; a02 ++;
} else { b += 2;
#ifdef UNIT } else
if (X > posY) { if (X > posY) {
#endif
b[ 0] = *(a01 + 0); b[ 0] = *(a01 + 0);
#ifdef UNIT b[ 1] = *(a01 + 1);
a01 += lda;
b += 2;
} else { } else {
#ifdef UNIT
b[ 0] = ONE; b[ 0] = ONE;
} b[ 1] = *(a01 + 1);
#else
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
#endif #endif
b[ 1] = *(a01 + 1); b += 2;
} }
b += 2;
} }
posY += 2; posY += 2;
} }
@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i > 0) { if (i > 0) {
do { do {
if (X < posY) { if (X < posY) {
a01 ++; a01 += 1;
} else { b ++;
#ifdef UNIT } else
if (X > posY) { if (X > posY) {
#endif
b[ 0] = *(a01 + 0); b[ 0] = *(a01 + 0);
#ifdef UNIT a01 += lda;
b ++;
} else { } else {
#ifdef UNIT
b[ 0] = ONE; b[ 0] = ONE;
} #else
b[ 0] = *(a01 + 0);
#endif #endif
a01 += lda; a01 += lda;
} b ++;
b ++; }
X ++;
i --; X += 1;
i --;
} while (i > 0); } while (i > 0);
} }
// posY += 1; posY += 1;
} }
return 0; return 0;

View File

@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) { if (m & 1) {
if (X < posY) { if (X < posY) {
/* ao1 += 1; ao1 += 1;
ao2 += 1; */ ao2 += 1;
b += 2; b += 2;
} else } else
if (X > posY) { if (X > posY) {
@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01; b[ 0] = data01;
b[ 1] = data02; b[ 1] = data02;
// ao1 += lda; ao1 += lda;
b += 2; b += 2;
} else { } else {
#ifdef UNIT #ifdef UNIT
@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01; b[ 0] = data01;
b[ 1] = ZERO; b[ 1] = ZERO;
#endif #endif
// ao1 += lda; ao1 += lda;
b += 2; b += 2;
} }
} }
@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m; i = m;
if (m > 0) { if (m > 0) {
do { do {
if (X < posY) {
b += 1;
ao1 += 1;
} else
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += lda;
} else {
#ifdef UNIT #ifdef UNIT
if (X > posY) { b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif #endif
b[ 0] = *(ao1 + 0); b += 1;
#ifdef UNIT ao1 += lda;
} else { }
b[ 0] = ONE;
} X += 1;
#endif
b ++;
ao1 += lda;
X ++;
i --; i --;
} while (i > 0); } while (i > 0);
} }

View File

@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) { if (X < posY) {
if (m & 2) { if (m & 2) {
/* ao1 += 2; ao1 += 2;
ao2 += 2; ao2 += 2;
ao3 += 2; ao3 += 2;
ao4 += 2; */ ao4 += 2;
b += 8; b += 8;
} }
if (m & 1) { if (m & 1) {
/* ao1 += 1; ao1 += 1;
ao2 += 1; ao2 += 1;
ao3 += 1; ao3 += 1;
ao4 += 1; */ ao4 += 1;
b += 4; b += 4;
} }
@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data08; b[ 7] = data08;
ao1 += 2 * lda; ao1 += 2 * lda;
// ao2 += 2 * lda; ao2 += 2 * lda;
b += 8; b += 8;
} }
@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = data03; b[ 2] = data03;
b[ 3] = data04; b[ 3] = data04;
// ao1 += lda; ao1 += lda;
b += 4; b += 4;
} }
@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i) { if (i) {
if (X < posY) { if (X < posY) {
// ao1 += 2; ao1 += 2;
b += 2; b += 2;
} else } else
if (X > posY) { if (X > posY) {
@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01; b[ 0] = data01;
b[ 1] = data02; b[ 1] = data02;
// ao1 += lda; ao1 += lda;
b += 2; b += 2;
} else { } else {
#ifdef UNIT #ifdef UNIT
@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do { do {
if (X < posY) { if (X < posY) {
b += 1;
ao1 += 1; ao1 += 1;
} else { } else
#ifdef UNIT
if (X > posY) { if (X > posY) {
#endif data01 = *(ao1 + 0);
b[ 0] = *(ao1 + 0); b[ 0] = data01;
#ifdef UNIT ao1 += lda;
b += 1;
} else { } else {
#ifdef UNIT
b[ 0] = ONE; b[ 0] = ONE;
} #else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif #endif
ao1 += lda; ao1 += lda;
} b += 1;
b ++; }
X ++;
X += 1;
i --; i --;
} while (i > 0); } while (i > 0);
} }

View File

@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
} }
a1 += 2 * lda; a1 += 2 * lda;
// a2 += 2 * lda; a2 += 2 * lda;
b += 8; b += 8;
ii += 2; ii += 2;

View File

@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} }
if (m & 1) { if (m & 1) {
#ifdef UNIT
if (X > posY) {
ao1 += 2;
ao2 += 2;
b += 4;
} else
if (X < posY) { if (X < posY) {
#endif data1 = *(ao1 + 0);
b[ 0] = *(ao1 + 0); data2 = *(ao1 + 1);
b[ 1] = *(ao1 + 1); data3 = *(ao1 + 2);
#ifdef UNIT data4 = *(ao1 + 3);
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
ao1 += lda;
b += 4;
} else { } else {
#ifdef UNIT
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = ZERO; b[ 1] = ZERO;
} b[ 2] = data3;
b[ 3] = data4;
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
#endif #endif
b += 4; b += 4;
}
} }
posY += 2; posY += 2;
@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0); } while (i > 0);
} }
// posY += 1; posY += 1;
} }
return 0; return 0;

View File

@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj; BLASLONG i, ii, j, jj;
FLOAT data01 = 0.0, data02 = 0.0; FLOAT data01, data02;
FLOAT *a1; FLOAT *a1;
lda *= 2; lda *= 2;

View File

@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj; BLASLONG i, ii, j, jj;
FLOAT data01 = 0.0, data02 = 0.0, data03, data04; FLOAT data01, data02, data03, data04;
FLOAT data05, data06, data07 = 0.0, data08 = 0.0; FLOAT data05, data06, data07, data08;
FLOAT *a1, *a2; FLOAT *a1, *a2;
lda *= 2; lda *= 2;

1
kernel/mips/KERNEL.1004K Normal file
View File

@ -0,0 +1 @@
include $(KERNELDIR)/KERNEL.P5600

View File

@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c
else else
SASUMKERNEL = ../mips/asum.c SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c DASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/asum.c CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/asum.c ZASUMKERNEL = ../mips/zasum.c
endif endif
ifdef HAVE_MSA ifdef HAVE_MSA

View File

@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v2f64 v_alpha; v2f64 v_alpha;
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0; v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

View File

@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
while(i < n) while(i < n)
{ {
#if defined(DSDOT)
dot += y[iy] * x[ix] ; dot += (double)y[iy] * (double)x[ix] ;
#else
dot += y[iy] * x[ix];
#endif
ix += inc_x ; ix += inc_x ;
iy += inc_y ; iy += inc_y ;
i++ ; i++ ;

View File

@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *y_org = y; FLOAT *y_org = y;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0; v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0};
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0};
v_alpha = COPY_FLOAT_TO_VECTOR(alpha); v_alpha = COPY_FLOAT_TO_VECTOR(alpha);

View File

@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */ /* Optionally check input matrices for NaNs */
if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { if( LAPACKE_c_nancheck( n, x, incx ) ) {
return -2; return -2;
} }
} }

View File

@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */ /* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : lapack_int lrv, lcv; /* row, column stride */
( ( LAPACKE_lsame( storev, 'r' ) && if( matrix_layout == LAPACK_COL_MAJOR ) {
LAPACKE_lsame( side, 'l' ) ) ? m : lrv = 1;
( ( LAPACKE_lsame( storev, 'r' ) && lcv = ldv;
LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); } else {
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && lrv = ldv;
LAPACKE_lsame( side, 'l' ) ) ? m : lcv = 1;
( ( LAPACKE_lsame( storev, 'c' ) && }
LAPACKE_lsame( side, 'r' ) ) ? n : ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13; return -13;
} }
@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) { if( k > nrows_v ) {
@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
return -8; return -8;
} }
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) ) &v[(nrows_v-k)*lrv], ldv ) )
return -9; return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) { if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
return -8; return -8;
} }
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k,
ldv ) ) &v[(ncols_v-k)*lcv], ldv ) )
return -9; return -9;
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9; return -9;

View File

@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) { if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) {
return -2; return -2;
} }
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_c_nancheck( n-1, x, incx ) ) {
return -3; return -3;
} }
} }

View File

@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
lapack_complex_float tau, lapack_complex_float* c, lapack_complex_float tau, lapack_complex_float* c,
lapack_int ldc, lapack_complex_float* work ) lapack_int ldc, lapack_complex_float* work )
{ {
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_clarfx", -1 ); LAPACKE_xerbla( "LAPACKE_clarfx", -1 );
return -1; return -1;
@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) { if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) {
return -6; return -6;
} }
if( LAPACKE_c_nancheck( m, v, 1 ) ) { lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_c_nancheck( lv, v, 1 ) ) {
return -5; return -5;
} }
} }

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x,
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_c_nancheck( n, x, incx ) ) {
return -2; return -2;
} }
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10; return -10;
} }
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
return -9; return -9;
} }
} }

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9; return -9;
} }
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
return -8; return -8;
} }
} }

View File

@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */ /* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : lapack_int lrv, lcv; /* row, column stride */
( ( LAPACKE_lsame( storev, 'r' ) && if( matrix_layout == LAPACK_COL_MAJOR ) {
LAPACKE_lsame( side, 'l' ) ) ? m : lrv = 1;
( ( LAPACKE_lsame( storev, 'r' ) && lcv = ldv;
LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); } else {
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && lrv = ldv;
LAPACKE_lsame( side, 'l' ) ) ? m : lcv = 1;
( ( LAPACKE_lsame( storev, 'c' ) && }
LAPACKE_lsame( side, 'r' ) ) ? n : ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13; return -13;
} }
@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) { if( k > nrows_v ) {
@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
return -8; return -8;
} }
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) ) &v[(nrows_v-k)*lrv], ldv ) )
return -9; return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) { if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
return -8; return -8;
} }
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k,
ldv ) ) &v[(ncols_v-k)*lcv], ldv ) )
return -9; return -9;
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9; return -9;

View File

@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) { if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) {
return -2; return -2;
} }
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_d_nancheck( n-1, x, incx ) ) {
return -3; return -3;
} }
} }

View File

@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
lapack_int n, const double* v, double tau, double* c, lapack_int n, const double* v, double tau, double* c,
lapack_int ldc, double* work ) lapack_int ldc, double* work )
{ {
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_dlarfx", -1 ); LAPACKE_xerbla( "LAPACKE_dlarfx", -1 );
return -1; return -1;
@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) { if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) {
return -6; return -6;
} }
if( LAPACKE_d_nancheck( m, v, 1 ) ) { lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_d_nancheck( lv, v, 1 ) ) {
return -5; return -5;
} }
} }

View File

@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_d_nancheck( n, x, incx ) ) {
return -2; return -2;
} }
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {

View File

@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9; return -9;
} }
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
return -8; return -8;
} }
} }

View File

@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10; return -10;
} }
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
return -9; return -9;
} }
} }

View File

@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */ /* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : lapack_int lrv, lcv; /* row, column stride */
( ( LAPACKE_lsame( storev, 'r' ) && if( matrix_layout == LAPACK_COL_MAJOR ) {
LAPACKE_lsame( side, 'l' ) ) ? m : lrv = 1;
( ( LAPACKE_lsame( storev, 'r' ) && lcv = ldv;
LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); } else {
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && lrv = ldv;
LAPACKE_lsame( side, 'l' ) ) ? m : lcv = 1;
( ( LAPACKE_lsame( storev, 'c' ) && }
LAPACKE_lsame( side, 'r' ) ) ? n : ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13; return -13;
} }
@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) { if( k > nrows_v ) {
@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
return -8; return -8;
} }
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) ) &v[(nrows_v-k)*lrv], ldv ) )
return -9; return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) { if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
return -8; return -8;
} }
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k,
ldv ) ) &v[(ncols_v-k)*lcv], ldv ) )
return -9; return -9;
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9; return -9;

View File

@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) { if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) {
return -2; return -2;
} }
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_s_nancheck( n-1, x, incx ) ) {
return -3; return -3;
} }
} }

View File

@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
lapack_int n, const float* v, float tau, float* c, lapack_int n, const float* v, float tau, float* c,
lapack_int ldc, float* work ) lapack_int ldc, float* work )
{ {
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_slarfx", -1 ); LAPACKE_xerbla( "LAPACKE_slarfx", -1 );
return -1; return -1;
@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) { if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) {
return -6; return -6;
} }
if( LAPACKE_s_nancheck( m, v, 1 ) ) { lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_s_nancheck( lv, v, 1 ) ) {
return -5; return -5;
} }
} }

View File

@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_s_nancheck( n, x, incx ) ) {
return -2; return -2;
} }
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {

View File

@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9; return -9;
} }
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
return -8; return -8;
} }
} }

View File

@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10; return -10;
} }
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
return -9; return -9;
} }
} }

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */ /* Optionally check input matrices for NaNs */
if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { if( LAPACKE_z_nancheck( n, x, incx ) ) {
return -2; return -2;
} }
} }

View File

@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input matrices for NaNs */ /* Optionally check input matrices for NaNs */
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : lapack_int lrv, lcv; /* row, column stride */
( ( LAPACKE_lsame( storev, 'r' ) && if( matrix_layout == LAPACK_COL_MAJOR ) {
LAPACKE_lsame( side, 'l' ) ) ? m : lrv = 1;
( ( LAPACKE_lsame( storev, 'r' ) && lcv = ldv;
LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); } else {
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && lrv = ldv;
LAPACKE_lsame( side, 'l' ) ) ? m : lcv = 1;
( ( LAPACKE_lsame( storev, 'c' ) && }
LAPACKE_lsame( side, 'r' ) ) ? n : ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -13; return -13;
} }
@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > nrows_v ) { if( k > nrows_v ) {
@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
return -8; return -8;
} }
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k,
&v[(nrows_v-k)*ldv], ldv ) ) &v[(nrows_v-k)*lrv], ldv ) )
return -9; return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
return -9; return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k,
ldv ) ) &v[k*lrv], ldv ) )
return -9; return -9;
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
if( k > ncols_v ) { if( k > ncols_v ) {
LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
return -8; return -8;
} }
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k,
ldv ) ) &v[(ncols_v-k)*lcv], ldv ) )
return -9; return -9;
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
return -9; return -9;

View File

@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) { if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) {
return -2; return -2;
} }
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_z_nancheck( n-1, x, incx ) ) {
return -3; return -3;
} }
} }

View File

@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
lapack_complex_double tau, lapack_complex_double* c, lapack_complex_double tau, lapack_complex_double* c,
lapack_int ldc, lapack_complex_double* work ) lapack_int ldc, lapack_complex_double* work )
{ {
lapack_int lv;
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
LAPACKE_xerbla( "LAPACKE_zlarfx", -1 ); LAPACKE_xerbla( "LAPACKE_zlarfx", -1 );
return -1; return -1;
@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) { if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) {
return -6; return -6;
} }
if( LAPACKE_z_nancheck( m, v, 1 ) ) { lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
if( LAPACKE_z_nancheck( lv, v, 1 ) ) {
return -5; return -5;
} }
} }

View File

@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x,
#ifndef LAPACK_DISABLE_NAN_CHECK #ifndef LAPACK_DISABLE_NAN_CHECK
if( LAPACKE_get_nancheck() ) { if( LAPACKE_get_nancheck() ) {
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { if( LAPACKE_z_nancheck( n, x, incx ) ) {
return -2; return -2;
} }
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -10; return -10;
} }
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
return -9; return -9;
} }
} }

View File

@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans,
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
return -9; return -9;
} }
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
return -8; return -8;
} }
} }

View File

@ -512,7 +512,7 @@ C END IF
* *
* Call the kernel * Call the kernel
* *
#if defined(_OPENMP) && _OPENMP >= 201307L #if defined(_OPENMP) && _OPENMP >= 201307
IF( TTYPE.NE.1 ) THEN IF( TTYPE.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(in:WORK(MYID-1))

View File

@ -481,7 +481,7 @@
* *
* Call the kernel * Call the kernel
* *
#if defined(_OPENMP) && _OPENMP >= 201307L #if defined(_OPENMP) && _OPENMP >= 201307
IF( TTYPE.NE.1 ) THEN IF( TTYPE.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(in:WORK(MYID-1))

View File

@ -512,7 +512,7 @@ C END IF
* *
* Call the kernel * Call the kernel
* *
#if defined(_OPENMP) && _OPENMP >= 201307L #if defined(_OPENMP) && _OPENMP >= 201307
IF( TTYPE.NE.1 ) THEN IF( TTYPE.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))

View File

@ -67,6 +67,26 @@ double sqrt(double);
#undef GETRF_FACTOR #undef GETRF_FACTOR
#define GETRF_FACTOR 1.00 #define GETRF_FACTOR 1.00
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t getrf_lock = 0;
#else
static BLASULONG getrf_lock = 0UL;
#endif
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t getrf_flag_lock = 0;
#else
static BLASULONG getrf_flag_lock = 0UL;
#endif
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
double m = (double)(M - IS - BK); double m = (double)(M - IS - BK);
@ -99,7 +119,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
FLOAT *sbb = sb; FLOAT *sbb = sb;
#if __STDC_VERSION__ >= 201112L
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
#else
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
#endif
blasint *ipiv = (blasint *)args -> c; blasint *ipiv = (blasint *)args -> c;
@ -177,7 +201,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
/* Non blocking implementation */ /* Non blocking implementation */
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
@ -216,9 +245,12 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
FLOAT *sbb= sb; FLOAT *sbb= sb;
blasint *ipiv = (blasint *)args -> c; blasint *ipiv = (blasint *)args -> c;
BLASLONG jw;
#if __STDC_VERSION__ >= 201112L
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
#else
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
#endif
if (args -> a == NULL) { if (args -> a == NULL) {
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@ -245,8 +277,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
#if 1
{
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
} while (jw);
}
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
#endif
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@ -283,18 +327,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
b + (is + jjs * lda) * COMPSIZE, lda, is); b + (is + jjs * lda) * COMPSIZE, lda, is);
} }
} }
MB; MB;
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++) {
LOCK_COMMAND(&getrf_lock);
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
UNLOCK_COMMAND(&getrf_lock);
}
} }
LOCK_COMMAND(&getrf_flag_lock);
flag[mypos * CACHE_LINE_SIZE] = 0; flag[mypos * CACHE_LINE_SIZE] = 0;
UNLOCK_COMMAND(&getrf_flag_lock);
if (m == 0) { if (m == 0) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
LOCK_COMMAND(&getrf_lock);
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
UNLOCK_COMMAND(&getrf_lock);
} }
} }
@ -318,7 +367,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
if ((current != mypos) && (!is)) { if ((current != mypos) && (!is)) {
#if 1
LOCK_COMMAND(&getrf_lock);
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
UNLOCK_COMMAND(&getrf_lock);
} while (jw == 0);
#else
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
#endif
} }
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
@ -327,7 +387,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
MB; MB;
if (is + min_i >= m) { if (is + min_i >= m) {
LOCK_COMMAND(&getrf_lock);
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
UNLOCK_COMMAND(&getrf_lock);
} }
} }
@ -339,7 +401,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
#if 1
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
UNLOCK_COMMAND(&getrf_lock);
} while(jw != 0);
#else
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
#endif
} }
} }
@ -374,11 +447,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
BLASLONG i, j, k, is, bk; BLASLONG i, j, k, is, bk;
BLASLONG num_cpu; BLASLONG num_cpu;
BLASLONG f;
#ifdef _MSC_VER #ifdef _MSC_VER
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
#else #else
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#endif #endif
#ifndef COMPLEX #ifndef COMPLEX
@ -501,11 +580,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (mm >= nn) { if (mm >= nn) {
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = nn;
if (nn < width) width = nn; if (nn < width) width = nn;
nn -= width; nn -= width;
range_N[num_cpu + 1] = range_N[num_cpu] + width; range_N[num_cpu + 1] = range_N[num_cpu] + width;
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = mm;
if (mm < width) width = mm; if (mm < width) width = mm;
if (nn <= 0) width = mm; if (nn <= 0) width = mm;
mm -= width; mm -= width;
@ -514,11 +595,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
} else { } else {
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = mm;
if (mm < width) width = mm; if (mm < width) width = mm;
mm -= width; mm -= width;
range_M[num_cpu + 1] = range_M[num_cpu] + width; range_M[num_cpu + 1] = range_M[num_cpu] + width;
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (width == 0) width = nn;
if (nn < width) width = nn; if (nn < width) width = nn;
if (mm <= 0) width = nn; if (mm <= 0) width = nn;
nn -= width; nn -= width;
@ -561,7 +644,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_n_new[1] = offset + is + bk; range_n_new[1] = offset + is + bk;
if (num_cpu > 0) { if (num_cpu > 0) {
queue[num_cpu - 1].next = NULL; queue[num_cpu - 1].next = NULL;
exec_blas_async(0, &queue[0]); exec_blas_async(0, &queue[0]);
@ -572,8 +654,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (iinfo && !info) info = iinfo + is; if (iinfo && !info) info = iinfo + is;
for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; for (i = 0; i < num_cpu; i ++) {
#if 1
LOCK_COMMAND(&getrf_flag_lock);
f=flag[i*CACHE_LINE_SIZE];
UNLOCK_COMMAND(&getrf_flag_lock);
while (f!=0) {
LOCK_COMMAND(&getrf_flag_lock);
f=flag[i*CACHE_LINE_SIZE];
UNLOCK_COMMAND(&getrf_flag_lock);
};
#else
while (flag[i*CACHE_LINE_SIZE]) {};
#endif
}
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
} else { } else {
@ -634,8 +728,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG range[MAX_CPU_NUMBER + 1];
BLASLONG width, nn, num_cpu; BLASLONG width, nn, num_cpu;
#if __STDC_VERSION__ >= 201112L
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); _Atomic
#else
volatile
#endif
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE

View File

@ -0,0 +1,664 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifndef USE_SIMPLE_THREADED_LEVEL3
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
static FLOAT dm1 = -1.;
#ifndef KERNEL_FUNC
#ifndef LOWER
#define KERNEL_FUNC SYRK_KERNEL_U
#else
#define KERNEL_FUNC SYRK_KERNEL_L
#endif
#endif
#ifndef LOWER
#ifndef COMPLEX
#define TRSM_KERNEL TRSM_KERNEL_LT
#else
#define TRSM_KERNEL TRSM_KERNEL_LC
#endif
#else
#ifndef COMPLEX
#define TRSM_KERNEL TRSM_KERNEL_RN
#else
#define TRSM_KERNEL TRSM_KERNEL_RR
#endif
#endif
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif
#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef LOWER
#define TRANS
#endif
#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
#elif !defined(LOWER) && defined(TRANS)
#define SYRK_LOCAL SYRK_UT
#elif defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_LN
#else
#define SYRK_LOCAL SYRK_LT
#endif
#endif
typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#endif
#endif
#ifndef ICOPY_OPERATION
#ifndef TRANS
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif
#ifndef OCOPY_OPERATION
#ifdef TRANS
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif
#ifndef S
#define S args -> a
#endif
#ifndef A
#define A args -> b
#endif
#ifndef C
#define C args -> c
#endif
#ifndef LDA
#define LDA args -> lda
#endif
#ifndef N
#define N args -> m
#endif
#ifndef K
#define K args -> k
#endif
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
FLOAT *buffer[DIVIDE_RATE];
BLASLONG k, lda;
BLASLONG m_from, m_to;
FLOAT *alpha;
FLOAT *a, *c;
job_t *job = (job_t *)args -> common;
BLASLONG xxx, bufferside;
BLASLONG jjs, min_jj;
BLASLONG is, min_i, div_n;
BLASLONG i, current;
k = K;
a = (FLOAT *)A;
c = (FLOAT *)C;
lda = LDA;
alpha = (FLOAT *)args -> alpha;
m_from = range_n[mypos + 0];
m_to = range_n[mypos + 1];
#if 0
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
#endif
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
}
#ifndef LOWER
TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(m_to, xxx + div_n) - jjs;
#ifndef LOWER
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
if (min_jj > GEMM_P) min_jj = GEMM_P;
#endif
#ifndef LOWER
OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
TRSM_KERNEL (k, min_jj, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
sb,
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
a + jjs * lda * COMPSIZE, lda, 0);
#else
ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
TRSM_KERNEL (min_jj, k, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
sb,
a + jjs * COMPSIZE, lda, 0);
#endif
}
#ifndef LOWER
for (i = 0; i <= mypos; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
for (i = mypos; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif
WMB;
}
min_i = m_to - m_from;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif
current = mypos;
#ifndef LOWER
while (current < args -> nthreads)
#else
while (current >= 0)
#endif
{
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
/* thread has to wait */
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, m_from, xxx);
if (m_from + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
#ifndef LOWER
current ++;
#else
current --;
#endif
}
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif
current = mypos;
#ifndef LOWER
while (current < args -> nthreads)
#else
while (current >= 0)
#endif
{
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, is, xxx);
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
#ifndef LOWER
current ++;
#else
current --;
#endif
}
}
for (i = 0; i < args -> nthreads; i++) {
if (i != mypos) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
}
}
}
return 0;
}
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
BLASLONG num_cpu;
BLASLONG nthreads = args -> nthreads;
BLASLONG width, i, j, k;
BLASLONG n, n_from, n_to;
int mode, mask;
double dnum;
#ifndef COMPLEX
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_REAL;
mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
mode = BLAS_SINGLE | BLAS_REAL;
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_COMPLEX;
mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif
#endif
newarg.m = args -> m;
newarg.k = args -> k;
newarg.a = args -> a;
newarg.b = args -> b;
newarg.c = args -> c;
newarg.lda = args -> lda;
newarg.alpha = args -> alpha;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
n_from = 0;
n_to = args -> m;
#ifndef LOWER
range[MAX_CPU_NUMBER] = n_to - n_from;
range[0] = 0;
num_cpu = 0;
i = 0;
n = n_to - n_from;
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
if (nthreads - num_cpu > 1) {
double di = (double)i;
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;
} else {
width = n - i;
}
range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = inner_thread;
queue[num_cpu].args = &newarg;
queue[num_cpu].range_m = NULL;
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
i += width;
}
for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];
#else
range[0] = 0;
num_cpu = 0;
i = 0;
n = n_to - n_from;
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
if (nthreads - num_cpu > 1) {
double di = (double)i;
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;
} else {
width = n - i;
}
range[num_cpu + 1] = range[num_cpu] + width;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = inner_thread;
queue[num_cpu].args = &newarg;
queue[num_cpu].range_m = NULL;
queue[num_cpu].range_n = range;
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
i += width;
}
#endif
newarg.nthreads = num_cpu;
if (num_cpu) {
for (j = 0; j < num_cpu; j++) {
for (i = 0; i < num_cpu; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[j].working[i][CACHE_LINE_SIZE * k] = 0;
}
}
}
queue[0].sa = sa;
queue[0].sb = sb;
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}
#endif
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
BLASLONG n, bk, i, blocking, lda;
BLASLONG info;
int mode;
blas_arg_t newarg;
FLOAT *a;
FLOAT alpha[2] = { -ONE, ZERO};
#ifndef COMPLEX
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
if (args -> nthreads == 1) {
#ifndef LOWER
info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
#else
info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
#endif
return info;
}
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
if (range_n) n = range_n[1] - range_n[0];
if (n <= GEMM_UNROLL_N * 2) {
#ifndef LOWER
info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
#else
info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
#endif
return info;
}
newarg.lda = lda;
newarg.ldb = lda;
newarg.ldc = lda;
newarg.alpha = alpha;
newarg.beta = NULL;
newarg.nthreads = args -> nthreads;
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
if (blocking > GEMM_Q) blocking = GEMM_Q;
for (i = 0; i < n; i += blocking) {
bk = n - i;
if (bk > blocking) bk = blocking;
newarg.m = bk;
newarg.n = bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
if (info) return info + i;
if (n - i - bk > 0) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
newarg.m = n - i - bk;
newarg.k = bk;
#ifndef LOWER
newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE;
#else
newarg.b = a + ((i + bk) + i * lda) * COMPSIZE;
#endif
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
thread_driver(&newarg, sa, sb);
#else
#ifndef LOWER
newarg.m = bk;
newarg.n = n - i - bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
gemm_thread_n(mode | BLAS_TRANSA_T,
&newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);
newarg.n = n - i - bk;
newarg.k = bk;
newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE;
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
#if 0
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
#else
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
&newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
#endif
#else
newarg.m = n - i - bk;
newarg.n = bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
newarg.b = a + (i + bk + i * lda) * COMPSIZE;
gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
&newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);
newarg.n = n - i - bk;
newarg.k = bk;
newarg.a = a + (i + bk + i * lda) * COMPSIZE;
newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
#if 0
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
#else
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
&newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
#endif
#endif
#endif
}
}
return 0;
}

View File

@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16 #define SYMV_P 16
#endif #endif
#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500) #if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500)
#define SNUMOPT 2 #define SNUMOPT 2
#define DNUMOPT 2 #define DNUMOPT 2

View File

@ -25,6 +25,7 @@ endif ()
# known to hang with the native Windows and Android threads # known to hang with the native Windows and Android threads
# FIXME needs checking if this works on any of the other platforms # FIXME needs checking if this works on any of the other platforms
if (NOT NO_CBLAS)
if (NOT USE_OPENMP) if (NOT USE_OPENMP)
if (OS_CYGWIN_NT OR OS_LINUX) if (OS_CYGWIN_NT OR OS_LINUX)
set(OpenBLAS_utest_src set(OpenBLAS_utest_src
@ -33,6 +34,7 @@ set(OpenBLAS_utest_src
) )
endif() endif()
endif() endif()
endif()
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
set(OpenBLAS_utest_src set(OpenBLAS_utest_src

View File

@ -17,11 +17,13 @@ endif
#this does not work with OpenMP nor with native Windows or Android threads #this does not work with OpenMP nor with native Windows or Android threads
# FIXME TBD if this works on OSX, SunOS, POWER and zarch # FIXME TBD if this works on OSX, SunOS, POWER and zarch
ifneq ($(NO_CBLAS), 1)
ifndef USE_OPENMP ifndef USE_OPENMP
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
OBJS += test_fork.o OBJS += test_fork.o
endif endif
endif endif
endif
all : run_test all : run_test

View File

@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
#include "openblas_utest.h" #include "openblas_utest.h"
#include <sys/types.h>
#include <sys/wait.h> #include <sys/wait.h>
#include <cblas.h> #include <cblas.h>