Merge pull request #1570 from xianyi/develop
Update release-0.3.0 branch to match develop
This commit is contained in:
commit
939452ea9d
23
.travis.yml
23
.travis.yml
|
@ -7,6 +7,7 @@ language: c
|
|||
jobs:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
|
@ -57,7 +58,8 @@ jobs:
|
|||
- TARGET_BOX=LINUX32
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- stage: test
|
||||
- os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
|
@ -77,6 +79,7 @@ jobs:
|
|||
# which is slower than container-based infrastructure used for jobs
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
stage: test
|
||||
dist: trusty
|
||||
sudo: true
|
||||
|
@ -120,6 +123,7 @@ jobs:
|
|||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: clang
|
||||
addons:
|
||||
|
@ -147,6 +151,23 @@ jobs:
|
|||
env:
|
||||
- CMAKE=1
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
stage: test
|
||||
osx_image: xcode8
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-macos
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
only:
|
||||
|
|
6
Makefile
6
Makefile
|
@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
|||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
|
|
|
@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
|||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
|
@ -101,8 +96,9 @@ endif
|
|||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
|
@ -115,7 +111,7 @@ endif
|
|||
|
||||
ifndef NO_SHARED
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||
|
|
|
@ -17,6 +17,10 @@ ifdef CPUIDEMU
|
|||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), 1004K)
|
||||
TARGET_FLAGS = -mips32r2
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), P5600)
|
||||
TARGET_FLAGS = -mips32r5
|
||||
endif
|
||||
|
|
|
@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
|
|||
# automatically detected by the the script.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
|
|
|
@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
|||
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
||||
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
||||
ifeq ($(origin CC),default)
|
||||
|
||||
# Check if $(CC) refers to a valid command and set the value to gcc if not
|
||||
ifneq ($(findstring cmd.exe,$(SHELL)),)
|
||||
ifeq ($(shell where $(CC) 2>NUL),)
|
||||
CC = gcc
|
||||
# Change the default compile to clang on Mac OSX.
|
||||
# http://stackoverflow.com/questions/714100/os-detecting-makefile
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CC = clang
|
||||
# EXTRALIB += -Wl,-no_compact_unwind
|
||||
endif
|
||||
endif
|
||||
else # POSIX-ish
|
||||
ifeq ($(shell command -v $(CC) 2>/dev/null),)
|
||||
ifeq ($(shell uname -s),Darwin)
|
||||
CC = clang
|
||||
# EXTRALIB += -Wl,-no_compact_unwind
|
||||
else
|
||||
CC = gcc
|
||||
endif # Darwin
|
||||
endif # CC exists
|
||||
endif # Shell is sane
|
||||
|
||||
endif # CC is set to default
|
||||
|
||||
# Default Fortran compiler (FC) is selected by f_check.
|
||||
|
||||
|
@ -175,6 +184,10 @@ endif
|
|||
|
||||
endif
|
||||
|
||||
ifndef NUM_PARALLEL
|
||||
NUM_PARALLEL = 1
|
||||
endif
|
||||
|
||||
ifndef NUM_THREADS
|
||||
NUM_THREADS = $(NUM_CORES)
|
||||
endif
|
||||
|
@ -230,7 +243,7 @@ endif
|
|||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
|
@ -424,7 +437,7 @@ CCOMMON_OPT += -fopenmp
|
|||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), INTEL)
|
||||
CCOMMON_OPT += -openmp
|
||||
CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
|
@ -555,9 +568,14 @@ CCOMMON_OPT += -march=mips64
|
|||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), 1004K)
|
||||
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), P5600)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), I6400)
|
||||
|
@ -704,7 +722,7 @@ FCOMMON_OPT += -i8
|
|||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -952,6 +970,8 @@ endif
|
|||
|
||||
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
||||
|
||||
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
||||
|
||||
ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
|
230
README.md
230
README.md
|
@ -5,175 +5,219 @@
|
|||
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
||||
|
||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
## Binary Packages
|
||||
We provide binary packages for the following platform.
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||
|
||||
## Installation from Source
|
||||
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
||||
|
||||
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
|
||||
### Dependencies
|
||||
|
||||
Building OpenBLAS requires the following to be installed:
|
||||
|
||||
* GNU Make
|
||||
* A C compiler, e.g. GCC or Clang
|
||||
* A Fortran compiler (optional, for LAPACK)
|
||||
* IBM MASS (optional, see below)
|
||||
|
||||
### Normal compile
|
||||
* type "make" to detect the CPU automatically.
|
||||
or
|
||||
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
|
||||
|
||||
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
|
||||
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
|
||||
The full target list is in the file `TargetList.txt`.
|
||||
|
||||
### Cross compile
|
||||
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
|
||||
|
||||
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
|
||||
The target must be specified explicitly when cross compiling.
|
||||
|
||||
Examples:
|
||||
|
||||
On X86 box, compile this library for loongson3a CPU.
|
||||
* On an x86 box, compile this library for a loongson3a CPU:
|
||||
```sh
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
```
|
||||
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
|
||||
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
|
||||
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
|
||||
```sh
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
```
|
||||
|
||||
### Debug version
|
||||
|
||||
make DEBUG=1
|
||||
A debug version can be built using `make DEBUG=1`.
|
||||
|
||||
### Compile with MASS Support on Power CPU (Optional dependency)
|
||||
### Compile with MASS support on Power CPU (optional)
|
||||
|
||||
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
|
||||
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as below -
|
||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
||||
are tuned for optimum performance on POWER architectures.
|
||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as shown:
|
||||
|
||||
* On Ubuntu:
|
||||
* On Ubuntu:
|
||||
```sh
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install libxlmass-devel.8.1.5
|
||||
```
|
||||
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
|
||||
sudo apt-get update</br>
|
||||
sudo apt-get install libxlmass-devel.8.1.5</br>
|
||||
* On RHEL/CentOS:
|
||||
```sh
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||
sudo rpm --import repomd.xml.key
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||
sudo yum install libxlmass-devel.8.1.5
|
||||
```
|
||||
|
||||
* On RHEL/CentOS:
|
||||
After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
|
||||
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
|
||||
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
|
||||
sudo rpm --import repomd.xml.key</br>
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
|
||||
sudo yum install libxlmass-devel.8.1.5</br>
|
||||
### Install to a specific directory (optional)
|
||||
|
||||
After installing MASS library, compile openblas with USE_MASS=1.
|
||||
Use `PREFIX=` when invoking `make`, for example
|
||||
|
||||
Example:
|
||||
```sh
|
||||
make install PREFIX=your_installation_directory
|
||||
```
|
||||
|
||||
Compiling on Power8 with MASS support -
|
||||
The default installation directory is `/opt/OpenBLAS`.
|
||||
|
||||
make USE_MASS=1 TARGET=POWER8
|
||||
## Supported CPUs and Operating Systems
|
||||
|
||||
### Install to the directory (optional)
|
||||
Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
Example:
|
||||
### Additional supported CPUs
|
||||
|
||||
make install PREFIX=your_installation_directory
|
||||
#### x86/x86-64
|
||||
|
||||
The default directory is /opt/OpenBLAS
|
||||
|
||||
## Support CPU & OS
|
||||
Please read GotoBLAS_01Readme.txt
|
||||
|
||||
### Additional support CPU:
|
||||
|
||||
#### x86/x86-64:
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
|
||||
#### MIPS64:
|
||||
#### MIPS64
|
||||
|
||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||
- **ICT Loongson 3B**: Experimental
|
||||
|
||||
#### ARM:
|
||||
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
|
||||
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
|
||||
#### ARM
|
||||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
|
||||
- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
|
||||
|
||||
#### ARM64
|
||||
|
||||
- **ARMv8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### PPC/PPC64
|
||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
|
||||
|
||||
### Support OS:
|
||||
### Supported OS
|
||||
|
||||
- **GNU/Linux**
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
|
||||
## Usages
|
||||
Link with libopenblas.a or -lopenblas for shared library.
|
||||
## Usage
|
||||
|
||||
### Set the number of threads with environment variables.
|
||||
Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
|
||||
compiled as a shared library.
|
||||
|
||||
Examples:
|
||||
### Setting the number of threads using environment variables
|
||||
|
||||
export OPENBLAS_NUM_THREADS=4
|
||||
Environment variables are used to specify a maximum number of threads.
|
||||
For example,
|
||||
|
||||
or
|
||||
```sh
|
||||
export OPENBLAS_NUM_THREADS=4
|
||||
export GOTO_NUM_THREADS=4
|
||||
export OMP_NUM_THREADS=4
|
||||
```
|
||||
|
||||
export GOTO_NUM_THREADS=4
|
||||
The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
|
||||
|
||||
or
|
||||
If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
|
||||
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
|
||||
compiled with `USE_OPENMP=1`.
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
### Setting the number of threads at runtime
|
||||
|
||||
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
|
||||
We provide the following functions to control the number of threads at runtime:
|
||||
|
||||
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
|
||||
```c
|
||||
void goto_set_num_threads(int num_threads);
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
```
|
||||
|
||||
### Set the number of threads on runtime.
|
||||
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
|
||||
|
||||
We provided the below functions to control the number of threads on runtime.
|
||||
## Reporting bugs
|
||||
|
||||
void goto_set_num_threads(int num_threads);
|
||||
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
|
||||
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
|
||||
|
||||
## Report Bugs
|
||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
||||
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
|
||||
|
||||
## Contact
|
||||
|
||||
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
||||
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
||||
|
||||
## ChangeLog
|
||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
||||
## Change log
|
||||
|
||||
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
||||
|
||||
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
|
||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
* OpenBLAS does not set processor affinity by default.
|
||||
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
|
||||
Makefile.rule. However, note that this may cause
|
||||
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
|
||||
However, it will be okay when you run the same test case on the shell.
|
||||
|
||||
## Contributing
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
|
||||
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
1. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
|
||||
to start a discussion around a feature idea or a bug.
|
||||
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
3. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
## Donation
|
||||
|
||||
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
|
||||
|
|
|
@ -56,6 +56,7 @@ CELL
|
|||
|
||||
3.MIPS CPU:
|
||||
P5600
|
||||
1004K
|
||||
|
||||
4.MIPS64 CPU:
|
||||
SICORTEX
|
||||
|
|
14
USAGE.md
14
USAGE.md
|
@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
|||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
Despite its name, and due to the use of memory buffers in functions like SGEMM,
|
||||
the setting of NUM_THREADS can be relevant even for a single-threaded build
|
||||
of OpenBLAS, if such functions get called by multiple threads of a program
|
||||
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
|
||||
a segmentation fault without displaying the above warning first.
|
||||
|
||||
Note that the number of threads used at runtime can be altered to differ from the
|
||||
value NUM_THREADS was set to at build time. At runtime, the actual number of
|
||||
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
|
||||
that this does not change the number of memory buffers that will be allocated,
|
||||
which is set at build time). The number of threads for a process can be set by
|
||||
using the mechanisms described below.
|
||||
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
|
|
2
c_check
2
c_check
|
@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
|
|||
$os = Linux if ($data =~ /OS_LINUX/);
|
||||
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
|
||||
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
|
||||
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||
$os = AIX if ($data =~ /OS_AIX/);
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
|
|
|
@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING)
|
|||
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_PARALLEL)
|
||||
set(NUM_PARALLEL 1)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_THREADS)
|
||||
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
||||
# HT?
|
||||
|
@ -224,6 +228,8 @@ endif ()
|
|||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||
|
||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
|
|
10
common.h
10
common.h
|
@ -93,7 +93,7 @@ extern "C" {
|
|||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
|
@ -179,7 +179,7 @@ extern "C" {
|
|||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
|
@ -649,6 +649,12 @@ int omp_get_num_procs(void);
|
|||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||
#endif
|
||||
#if (__STDC_VERSION__ >= 201112L)
|
||||
#ifndef _Atomic
|
||||
#define _Atomic volatile
|
||||
#endif
|
||||
#include <stdatomic.h>
|
||||
#endif
|
||||
#else
|
||||
#ifdef __ELF__
|
||||
int omp_in_parallel (void) __attribute__ ((weak));
|
||||
|
|
10
common_x86.h
10
common_x86.h
|
@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
result = x/y;
|
||||
return result;
|
||||
#else
|
||||
|
||||
#if (MAX_CPU_NUMBER > 64)
|
||||
if ( y > 64) {
|
||||
result = x/y;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
|
@ -327,7 +333,7 @@ REALNAME:
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
|
||||
#define PROLOGUE \
|
||||
.text; \
|
||||
.align 16; \
|
||||
|
|
|
@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
if (y <= 1) return x;
|
||||
|
||||
#if (MAX_CPU_NUMBER > 64)
|
||||
if (y > 64) {
|
||||
result = x / y;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
|
@ -403,7 +410,7 @@ REALNAME:
|
|||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
|
||||
#define PROLOGUE \
|
||||
.text; \
|
||||
.align 512; \
|
||||
|
|
|
@ -121,7 +121,7 @@ int detect(void)
|
|||
return CPU_VULCAN;
|
||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
|
||||
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX2T99;
|
||||
}
|
||||
|
||||
|
|
58
cpuid_mips.c
58
cpuid_mips.c
|
@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_P5600 1
|
||||
#define CPU_1004K 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"P5600"
|
||||
"P5600",
|
||||
"1004K"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -90,7 +92,7 @@ int detect(void){
|
|||
if (!strncmp("cpu", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
fprintf(stderr, "%s \n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
@ -99,43 +101,13 @@ int detect(void){
|
|||
fclose(infile);
|
||||
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("system type", buffer, 11)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
if (strstr(p, "5600")) {
|
||||
return CPU_P5600;
|
||||
} else if (strstr(p, "1004K")) {
|
||||
return CPU_1004K;
|
||||
} else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
@ -149,7 +121,7 @@ void get_architecture(void){
|
|||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_P5600){
|
||||
if(detect()==CPU_P5600|| detect()==CPU_1004K){
|
||||
printf("P5600");
|
||||
}else{
|
||||
printf("UNKNOWN");
|
||||
|
@ -170,6 +142,14 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
} else if (detect()==CPU_1004K) {
|
||||
printf("#define MIPS1004K\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 26144\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
|
@ -178,6 +158,8 @@ void get_cpuconfig(void){
|
|||
void get_libname(void){
|
||||
if(detect()==CPU_P5600) {
|
||||
printf("p5600\n");
|
||||
} else if (detect()==CPU_1004K) {
|
||||
printf("1004K\n");
|
||||
}else{
|
||||
printf("mips\n");
|
||||
}
|
||||
|
|
8
ctest.c
8
ctest.c
|
@ -60,6 +60,14 @@ OS_FREEBSD
|
|||
OS_NETBSD
|
||||
#endif
|
||||
|
||||
#if defined(__OpenBSD__)
|
||||
OS_OPENBSD
|
||||
#endif
|
||||
|
||||
#if defined(__DragonFly__)
|
||||
OS_DRAGONFLY
|
||||
#endif
|
||||
|
||||
#if defined(__sun)
|
||||
OS_SUNOS
|
||||
#endif
|
||||
|
|
|
@ -91,7 +91,12 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
|
|
@ -67,7 +67,12 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
|
|
@ -91,7 +91,12 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//#include <sys/mman.h>
|
||||
|
@ -49,11 +50,16 @@
|
|||
|
||||
int blas_server_avail = 0;
|
||||
|
||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#else
|
||||
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#endif
|
||||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
|
@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
|
|||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
//adjust buffer for each thread
|
||||
for(i=0; i<blas_cpu_number; i++){
|
||||
if(blas_thread_buffer[i]==NULL){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_cpu_number; j++){
|
||||
if(blas_thread_buffer[i][j]==NULL){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
}
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(ARCH_MIPS64)
|
||||
|
@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {
|
|||
|
||||
int blas_thread_init(void){
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
blas_get_cpu_number();
|
||||
|
||||
blas_server_avail = 1;
|
||||
|
||||
for(i=0; i<blas_num_threads; i++){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_num_threads; j++){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
}
|
||||
}
|
||||
|
||||
static void exec_threads(blas_queue_t *queue){
|
||||
static void exec_threads(blas_queue_t *queue, int buf_index){
|
||||
|
||||
void *buffer, *sa, *sb;
|
||||
int pos=0, release_flag=0;
|
||||
|
@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
|
|||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||
|
||||
pos = omp_get_thread_num();
|
||||
buffer = blas_thread_buffer[pos];
|
||||
buffer = blas_thread_buffer[buf_index][pos];
|
||||
|
||||
//fallback
|
||||
if(buffer==NULL) {
|
||||
|
@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){
|
|||
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
|
||||
BLASLONG i;
|
||||
BLASLONG i, buf_index;
|
||||
|
||||
if ((num <= 0) || (queue == NULL)) return 0;
|
||||
|
||||
|
@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
}
|
||||
#endif
|
||||
|
||||
while(true) {
|
||||
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Bool inuse = false;
|
||||
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||
#else
|
||||
if(blas_buffer_inuse[i] == false) {
|
||||
blas_buffer_inuse[i] = true;
|
||||
#endif
|
||||
buf_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(i != MAX_PARALLEL_NUMBER)
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
|
@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
queue[i].position = i;
|
||||
#endif
|
||||
|
||||
exec_threads(&queue[i]);
|
||||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||
#else
|
||||
blas_buffer_inuse[buf_index] = false;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
|
||||
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
||||
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
||||
#else
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
|
@ -209,7 +212,8 @@ int ret;
|
|||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
nums = CPU_COUNT_S(size,cpusetp);
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
#endif
|
||||
|
@ -246,7 +250,7 @@ int get_num_procs(void) {
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
|
@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
@ -344,7 +348,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
@ -368,7 +372,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
|
|
@ -54,6 +54,9 @@ static char* openblas_config_str=""
|
|||
#ifdef NO_AFFINITY
|
||||
"NO_AFFINITY "
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
"USE_OPENMP "
|
||||
#endif
|
||||
#ifndef DYNAMIC_ARCH
|
||||
CHAR_CORENAME
|
||||
#endif
|
||||
|
@ -61,18 +64,23 @@ static char* openblas_config_str=""
|
|||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
char *gotoblas_corename();
|
||||
static char tmp_config_str[256];
|
||||
#endif
|
||||
|
||||
static char tmp_config_str[256];
|
||||
int openblas_get_parallel();
|
||||
|
||||
char* CNAME() {
|
||||
#ifndef DYNAMIC_ARCH
|
||||
return openblas_config_str;
|
||||
#else
|
||||
char tmpstr[20];
|
||||
strcpy(tmp_config_str, openblas_config_str);
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
return tmp_config_str;
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
|
||||
|
@ -83,3 +91,4 @@ char* openblas_get_corename() {
|
|||
return gotoblas_corename();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -156,7 +156,7 @@ endif
|
|||
endif
|
||||
|
||||
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
|
6
f_check
6
f_check
|
@ -97,7 +97,7 @@ if ($compiler eq "") {
|
|||
|
||||
if ($data =~ /Intel/) {
|
||||
$vendor = INTEL;
|
||||
$openmp = "-openmp";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Sun Fortran/) {
|
||||
|
@ -127,7 +127,7 @@ if ($compiler eq "") {
|
|||
|
||||
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data =~ /zho_ge__/) {
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
}
|
||||
|
@ -155,7 +155,7 @@ if ($compiler eq "") {
|
|||
if ($compiler =~ /ifort/) {
|
||||
$vendor = INTEL;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pathf/) {
|
||||
|
|
|
@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
@ -1074,7 +1074,7 @@ static int get_num_cores(void) {
|
|||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
|
@ -1088,7 +1088,7 @@ static int get_num_cores(void) {
|
|||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
|
|
|
@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
} else
|
||||
nthreads = 1;
|
||||
|
||||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
|
|
|
@ -29,10 +29,8 @@ USE_TRMM = 1
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
USE_TRMM = 1
|
||||
|
|
|
@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
cmp N, #0
|
||||
ble axpy_kernel_L999
|
||||
|
||||
/*
|
||||
cmp INC_X, #0
|
||||
beq axpy_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq axpy_kernel_L999
|
||||
|
||||
*/
|
||||
cmp INC_X, #1
|
||||
bne axpy_kernel_S_BEGIN
|
||||
|
||||
|
|
|
@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
cmp N, #0
|
||||
ble rot_kernel_L999
|
||||
|
||||
/*
|
||||
cmp INC_X, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
*/
|
||||
cmp INC_X, #1
|
||||
bne rot_kernel_S_BEGIN
|
||||
|
||||
|
@ -584,6 +584,12 @@ rot_kernel_S1:
|
|||
rot_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
cmp INC_X, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
subs I, I, #1
|
||||
bne rot_kernel_S10
|
||||
|
|
|
@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
if (m & 1) {
|
||||
|
||||
if (X > posY) {
|
||||
/* ao1 += 1;
|
||||
ao2 += 1; */
|
||||
ao1 += 1;
|
||||
ao2 += 1;
|
||||
b += 2;
|
||||
} else
|
||||
#ifdef UNIT
|
||||
if (X < posY) {
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
#ifdef UNIT
|
||||
data01 = *(ao1 + 0);
|
||||
data02 = *(ao1 + 1);
|
||||
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
data02 = *(ao1 + 1);
|
||||
|
||||
b[ 0] = ONE;
|
||||
b[ 1] = data02;
|
||||
#else
|
||||
data01 = *(ao1 + 0);
|
||||
data02 = *(ao1 + 1);
|
||||
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
#endif
|
||||
ao1 += 2;
|
||||
b += 2;
|
||||
}
|
||||
#endif
|
||||
b[ 1] = *(ao1 + 1);
|
||||
b += 2;
|
||||
}
|
||||
|
||||
posY += 2;
|
||||
|
@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
} while (i > 0);
|
||||
}
|
||||
|
||||
// posY += 1;
|
||||
posY += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
i = (m & 15);
|
||||
if (i > 0) {
|
||||
if (X < posY) {
|
||||
/* a01 += i;
|
||||
a01 += i;
|
||||
a02 += i;
|
||||
a03 += i;
|
||||
a04 += i;
|
||||
|
@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
a13 += i;
|
||||
a14 += i;
|
||||
a15 += i;
|
||||
a16 += i; */
|
||||
a16 += i;
|
||||
b += 16 * i;
|
||||
} else
|
||||
if (X > posY) {
|
||||
|
@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
i = (m & 7);
|
||||
if (i > 0) {
|
||||
if (X < posY) {
|
||||
/* a01 += i;
|
||||
a01 += i;
|
||||
a02 += i;
|
||||
a03 += i;
|
||||
a04 += i;
|
||||
a05 += i;
|
||||
a06 += i;
|
||||
a07 += i;
|
||||
a08 += i; */
|
||||
a08 += i;
|
||||
b += 8 * i;
|
||||
} else
|
||||
if (X > posY) {
|
||||
|
@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
b += 8;
|
||||
}
|
||||
|
||||
/* a02 += i * lda;
|
||||
a02 += i * lda;
|
||||
a03 += i * lda;
|
||||
a04 += i * lda;
|
||||
a05 += i * lda;
|
||||
a06 += i * lda;
|
||||
a07 += i * lda;
|
||||
a08 += i * lda; */
|
||||
a08 += i * lda;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
|
@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
i = (m & 3);
|
||||
if (i > 0) {
|
||||
if (X < posY) {
|
||||
/* a01 += i;
|
||||
a01 += i;
|
||||
a02 += i;
|
||||
a03 += i;
|
||||
a04 += i; */
|
||||
a04 += i;
|
||||
b += 4 * i;
|
||||
} else
|
||||
if (X > posY) {
|
||||
|
@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
a01 += lda;
|
||||
b += 4;
|
||||
}
|
||||
/* a02 += lda;
|
||||
a02 += lda;
|
||||
a03 += lda;
|
||||
a04 += lda; */
|
||||
a04 += lda;
|
||||
} else {
|
||||
|
||||
#ifdef UNIT
|
||||
|
@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
if (X < posY) {
|
||||
a01 ++;
|
||||
a02 ++;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b += 2;
|
||||
} else
|
||||
if (X > posY) {
|
||||
#endif
|
||||
b[ 0] = *(a01 + 0);
|
||||
#ifdef UNIT
|
||||
b[ 1] = *(a01 + 1);
|
||||
a01 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
b[ 1] = *(a01 + 1);
|
||||
#else
|
||||
b[ 0] = *(a01 + 0);
|
||||
b[ 1] = *(a01 + 1);
|
||||
#endif
|
||||
b[ 1] = *(a01 + 1);
|
||||
}
|
||||
b += 2;
|
||||
b += 2;
|
||||
}
|
||||
}
|
||||
posY += 2;
|
||||
}
|
||||
|
@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
if (i > 0) {
|
||||
do {
|
||||
if (X < posY) {
|
||||
a01 ++;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
a01 += 1;
|
||||
b ++;
|
||||
} else
|
||||
if (X > posY) {
|
||||
#endif
|
||||
b[ 0] = *(a01 + 0);
|
||||
#ifdef UNIT
|
||||
a01 += lda;
|
||||
b ++;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
#else
|
||||
b[ 0] = *(a01 + 0);
|
||||
#endif
|
||||
a01 += lda;
|
||||
}
|
||||
b ++;
|
||||
X ++;
|
||||
i --;
|
||||
a01 += lda;
|
||||
b ++;
|
||||
}
|
||||
|
||||
X += 1;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
// posY += 1;
|
||||
posY += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
if (m & 1) {
|
||||
|
||||
if (X < posY) {
|
||||
/* ao1 += 1;
|
||||
ao2 += 1; */
|
||||
ao1 += 1;
|
||||
ao2 += 1;
|
||||
b += 2;
|
||||
} else
|
||||
if (X > posY) {
|
||||
|
@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
|
@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
b[ 0] = data01;
|
||||
b[ 1] = ZERO;
|
||||
#endif
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
}
|
||||
}
|
||||
|
@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
i = m;
|
||||
if (m > 0) {
|
||||
do {
|
||||
if (X < posY) {
|
||||
b += 1;
|
||||
ao1 += 1;
|
||||
} else
|
||||
if (X > posY) {
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
b += 1;
|
||||
ao1 += lda;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
if (X > posY) {
|
||||
b[ 0] = ONE;
|
||||
#else
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
#ifdef UNIT
|
||||
} else {
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
#endif
|
||||
b ++;
|
||||
ao1 += lda;
|
||||
X ++;
|
||||
b += 1;
|
||||
ao1 += lda;
|
||||
}
|
||||
|
||||
X += 1;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
|
|
@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
if (X < posY) {
|
||||
|
||||
if (m & 2) {
|
||||
/* ao1 += 2;
|
||||
ao1 += 2;
|
||||
ao2 += 2;
|
||||
ao3 += 2;
|
||||
ao4 += 2; */
|
||||
ao4 += 2;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
/* ao1 += 1;
|
||||
ao1 += 1;
|
||||
ao2 += 1;
|
||||
ao3 += 1;
|
||||
ao4 += 1; */
|
||||
ao4 += 1;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
|
@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
b[ 7] = data08;
|
||||
|
||||
ao1 += 2 * lda;
|
||||
// ao2 += 2 * lda;
|
||||
ao2 += 2 * lda;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
|
@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
b[ 2] = data03;
|
||||
b[ 3] = data04;
|
||||
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
|
@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
if (i) {
|
||||
|
||||
if (X < posY) {
|
||||
// ao1 += 2;
|
||||
ao1 += 2;
|
||||
b += 2;
|
||||
} else
|
||||
if (X > posY) {
|
||||
|
@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
|
@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
do {
|
||||
|
||||
if (X < posY) {
|
||||
b += 1;
|
||||
ao1 += 1;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
} else
|
||||
if (X > posY) {
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
#ifdef UNIT
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
ao1 += lda;
|
||||
b += 1;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
#else
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
#endif
|
||||
ao1 += lda;
|
||||
}
|
||||
b ++;
|
||||
X ++;
|
||||
ao1 += lda;
|
||||
b += 1;
|
||||
}
|
||||
|
||||
X += 1;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
|
|
@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
}
|
||||
|
||||
a1 += 2 * lda;
|
||||
// a2 += 2 * lda;
|
||||
a2 += 2 * lda;
|
||||
b += 8;
|
||||
|
||||
ii += 2;
|
||||
|
|
|
@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
}
|
||||
|
||||
if (m & 1) {
|
||||
#ifdef UNIT
|
||||
|
||||
if (X > posY) {
|
||||
ao1 += 2;
|
||||
ao2 += 2;
|
||||
b += 4;
|
||||
|
||||
} else
|
||||
if (X < posY) {
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
b[ 1] = *(ao1 + 1);
|
||||
#ifdef UNIT
|
||||
data1 = *(ao1 + 0);
|
||||
data2 = *(ao1 + 1);
|
||||
data3 = *(ao1 + 2);
|
||||
data4 = *(ao1 + 3);
|
||||
|
||||
b[ 0] = data1;
|
||||
b[ 1] = data2;
|
||||
b[ 2] = data3;
|
||||
b[ 3] = data4;
|
||||
|
||||
ao1 += lda;
|
||||
b += 4;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
data3 = *(ao1 + 2);
|
||||
data4 = *(ao1 + 3);
|
||||
|
||||
b[ 0] = ONE;
|
||||
b[ 1] = ZERO;
|
||||
}
|
||||
b[ 2] = data3;
|
||||
b[ 3] = data4;
|
||||
#else
|
||||
data1 = *(ao1 + 0);
|
||||
data2 = *(ao1 + 1);
|
||||
data3 = *(ao1 + 2);
|
||||
data4 = *(ao1 + 3);
|
||||
|
||||
b[ 0] = data1;
|
||||
b[ 1] = data2;
|
||||
b[ 2] = data3;
|
||||
b[ 3] = data4;
|
||||
#endif
|
||||
b += 4;
|
||||
b += 4;
|
||||
}
|
||||
}
|
||||
|
||||
posY += 2;
|
||||
|
@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
} while (i > 0);
|
||||
}
|
||||
|
||||
// posY += 1;
|
||||
posY += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
BLASLONG i, ii, j, jj;
|
||||
|
||||
FLOAT data01 = 0.0, data02 = 0.0;
|
||||
FLOAT data01, data02;
|
||||
FLOAT *a1;
|
||||
|
||||
lda *= 2;
|
||||
|
|
|
@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
BLASLONG i, ii, j, jj;
|
||||
|
||||
FLOAT data01 = 0.0, data02 = 0.0, data03, data04;
|
||||
FLOAT data05, data06, data07 = 0.0, data08 = 0.0;
|
||||
FLOAT data01, data02, data03, data04;
|
||||
FLOAT data05, data06, data07, data08;
|
||||
FLOAT *a1, *a2;
|
||||
|
||||
lda *= 2;
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
include $(KERNELDIR)/KERNEL.P5600
|
|
@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c
|
|||
else
|
||||
SASUMKERNEL = ../mips/asum.c
|
||||
DASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/asum.c
|
||||
ZASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/zasum.c
|
||||
ZASUMKERNEL = ../mips/zasum.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
|
@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
|||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v2f64 v_alpha;
|
||||
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0;
|
||||
v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
|
||||
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
|
||||
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
|
||||
v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
|
||||
|
||||
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
|
||||
|
||||
|
|
|
@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
dot += y[iy] * x[ix] ;
|
||||
#if defined(DSDOT)
|
||||
dot += (double)y[iy] * (double)x[ix] ;
|
||||
#else
|
||||
dot += y[iy] * x[ix];
|
||||
#endif
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
|
|
@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
|||
FLOAT *y_org = y;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0;
|
||||
v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0};
|
||||
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
|
||||
v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0};
|
||||
|
||||
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
|
||||
|
||||
|
|
|
@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
|
|||
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
|
||||
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_c_nancheck( n, x, incx ) ) {
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
lapack_int lrv, lcv; /* row, column stride */
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
lrv = 1;
|
||||
lcv = ldv;
|
||||
} else {
|
||||
lrv = ldv;
|
||||
lcv = 1;
|
||||
}
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -13;
|
||||
}
|
||||
|
@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
|||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
||||
ldv ) )
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > nrows_v ) {
|
||||
|
@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
|||
return -8;
|
||||
}
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k,
|
||||
&v[(nrows_v-k)*ldv], ldv ) )
|
||||
&v[(nrows_v-k)*lrv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
||||
ldv ) )
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > ncols_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
||||
ldv ) )
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k,
|
||||
&v[(ncols_v-k)*lcv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||
return -9;
|
||||
|
|
|
@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
|
|||
if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_c_nancheck( n-1, x, incx ) ) {
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
|
|||
lapack_complex_float tau, lapack_complex_float* c,
|
||||
lapack_int ldc, lapack_complex_float* work )
|
||||
{
|
||||
lapack_int lv;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfx", -1 );
|
||||
return -1;
|
||||
|
@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
|
|||
if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) {
|
||||
return -6;
|
||||
}
|
||||
if( LAPACKE_c_nancheck( m, v, 1 ) ) {
|
||||
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||
if( LAPACKE_c_nancheck( lv, v, 1 ) ) {
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x,
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_c_nancheck( n, x, incx ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {
|
||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -10;
|
||||
}
|
||||
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
|
||||
return -9;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -9;
|
||||
}
|
||||
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
|
||||
return -8;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
lapack_int lrv, lcv; /* row, column stride */
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
lrv = 1;
|
||||
lcv = ldv;
|
||||
} else {
|
||||
lrv = ldv;
|
||||
lcv = 1;
|
||||
}
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -13;
|
||||
}
|
||||
|
@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
|
|||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
||||
ldv ) )
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > nrows_v ) {
|
||||
|
@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
|
|||
return -8;
|
||||
}
|
||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k,
|
||||
&v[(nrows_v-k)*ldv], ldv ) )
|
||||
&v[(nrows_v-k)*lrv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
||||
ldv ) )
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > ncols_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
||||
ldv ) )
|
||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k,
|
||||
&v[(ncols_v-k)*lcv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||
return -9;
|
||||
|
|
|
@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
|
|||
if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_d_nancheck( n-1, x, incx ) ) {
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
|
|||
lapack_int n, const double* v, double tau, double* c,
|
||||
lapack_int ldc, double* work )
|
||||
{
|
||||
lapack_int lv;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_dlarfx", -1 );
|
||||
return -1;
|
||||
|
@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
|
|||
if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) {
|
||||
return -6;
|
||||
}
|
||||
if( LAPACKE_d_nancheck( m, v, 1 ) ) {
|
||||
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||
if( LAPACKE_d_nancheck( lv, v, 1 ) ) {
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_d_nancheck( n, x, incx ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {
|
||||
|
|
|
@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -9;
|
||||
}
|
||||
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
|
||||
return -8;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -10;
|
||||
}
|
||||
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
|
||||
return -9;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
lapack_int lrv, lcv; /* row, column stride */
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
lrv = 1;
|
||||
lcv = ldv;
|
||||
} else {
|
||||
lrv = ldv;
|
||||
lcv = 1;
|
||||
}
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -13;
|
||||
}
|
||||
|
@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
|
|||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
||||
ldv ) )
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > nrows_v ) {
|
||||
|
@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
|
|||
return -8;
|
||||
}
|
||||
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k,
|
||||
&v[(nrows_v-k)*ldv], ldv ) )
|
||||
&v[(nrows_v-k)*lrv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
||||
ldv ) )
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > ncols_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
||||
ldv ) )
|
||||
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k,
|
||||
&v[(ncols_v-k)*lcv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||
return -9;
|
||||
|
|
|
@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
|
|||
if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_s_nancheck( n-1, x, incx ) ) {
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
|
|||
lapack_int n, const float* v, float tau, float* c,
|
||||
lapack_int ldc, float* work )
|
||||
{
|
||||
lapack_int lv;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_slarfx", -1 );
|
||||
return -1;
|
||||
|
@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
|
|||
if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) {
|
||||
return -6;
|
||||
}
|
||||
if( LAPACKE_s_nancheck( m, v, 1 ) ) {
|
||||
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||
if( LAPACKE_s_nancheck( lv, v, 1 ) ) {
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_s_nancheck( n, x, incx ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {
|
||||
|
|
|
@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -9;
|
||||
}
|
||||
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
|
||||
return -8;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -10;
|
||||
}
|
||||
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
|
||||
return -9;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_z_nancheck( n, x, incx ) ) {
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
lapack_int lrv, lcv; /* row, column stride */
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
lrv = 1;
|
||||
lcv = ldv;
|
||||
} else {
|
||||
lrv = ldv;
|
||||
lcv = 1;
|
||||
}
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -13;
|
||||
}
|
||||
|
@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
|
|||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
||||
ldv ) )
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > nrows_v ) {
|
||||
|
@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
|
|||
return -8;
|
||||
}
|
||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k,
|
||||
&v[(nrows_v-k)*ldv], ldv ) )
|
||||
&v[(nrows_v-k)*lrv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
||||
ldv ) )
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > ncols_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
||||
ldv ) )
|
||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k,
|
||||
&v[(ncols_v-k)*lcv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||
return -9;
|
||||
|
|
|
@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
|
|||
if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_z_nancheck( n-1, x, incx ) ) {
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
|
|||
lapack_complex_double tau, lapack_complex_double* c,
|
||||
lapack_int ldc, lapack_complex_double* work )
|
||||
{
|
||||
lapack_int lv;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_zlarfx", -1 );
|
||||
return -1;
|
||||
|
@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
|
|||
if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) {
|
||||
return -6;
|
||||
}
|
||||
if( LAPACKE_z_nancheck( m, v, 1 ) ) {
|
||||
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||
if( LAPACKE_z_nancheck( lv, v, 1 ) ) {
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x,
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
||||
if( LAPACKE_z_nancheck( n, x, incx ) ) {
|
||||
return -2;
|
||||
}
|
||||
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {
|
||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -10;
|
||||
}
|
||||
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
|
||||
return -9;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans,
|
|||
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -9;
|
||||
}
|
||||
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
|
||||
if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
|
||||
return -8;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -512,7 +512,7 @@ C END IF
|
|||
*
|
||||
* Call the kernel
|
||||
*
|
||||
#if defined(_OPENMP) && _OPENMP >= 201307L
|
||||
#if defined(_OPENMP) && _OPENMP >= 201307
|
||||
IF( TTYPE.NE.1 ) THEN
|
||||
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
||||
!$OMP$ DEPEND(in:WORK(MYID-1))
|
||||
|
|
|
@ -481,7 +481,7 @@
|
|||
*
|
||||
* Call the kernel
|
||||
*
|
||||
#if defined(_OPENMP) && _OPENMP >= 201307L
|
||||
#if defined(_OPENMP) && _OPENMP >= 201307
|
||||
IF( TTYPE.NE.1 ) THEN
|
||||
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
||||
!$OMP$ DEPEND(in:WORK(MYID-1))
|
||||
|
|
|
@ -512,7 +512,7 @@ C END IF
|
|||
*
|
||||
* Call the kernel
|
||||
*
|
||||
#if defined(_OPENMP) && _OPENMP >= 201307L
|
||||
#if defined(_OPENMP) && _OPENMP >= 201307
|
||||
|
||||
IF( TTYPE.NE.1 ) THEN
|
||||
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
||||
|
|
|
@ -67,6 +67,26 @@ double sqrt(double);
|
|||
#undef GETRF_FACTOR
|
||||
#define GETRF_FACTOR 1.00
|
||||
|
||||
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||
static pthread_spinlock_t getrf_lock = 0;
|
||||
#else
|
||||
static BLASULONG getrf_lock = 0UL;
|
||||
#endif
|
||||
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||
static pthread_spinlock_t getrf_flag_lock = 0;
|
||||
#else
|
||||
static BLASULONG getrf_flag_lock = 0UL;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
|
||||
|
||||
double m = (double)(M - IS - BK);
|
||||
|
@ -99,7 +119,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
|||
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
|
||||
FLOAT *sbb = sb;
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
|
||||
#else
|
||||
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
||||
#endif
|
||||
|
||||
blasint *ipiv = (blasint *)args -> c;
|
||||
|
||||
|
@ -177,7 +201,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
|||
/* Non blocking implementation */
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||
|
@ -216,9 +245,12 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
FLOAT *sbb= sb;
|
||||
|
||||
blasint *ipiv = (blasint *)args -> c;
|
||||
|
||||
BLASLONG jw;
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
|
||||
#else
|
||||
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
||||
|
||||
#endif
|
||||
if (args -> a == NULL) {
|
||||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
||||
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||
|
@ -245,8 +277,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
#if 1
|
||||
{
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
do {
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
} while (jw);
|
||||
}
|
||||
#else
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
|
||||
|
||||
#endif
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
@ -283,18 +327,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
b + (is + jjs * lda) * COMPSIZE, lda, is);
|
||||
}
|
||||
}
|
||||
|
||||
MB;
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
}
|
||||
}
|
||||
|
||||
LOCK_COMMAND(&getrf_flag_lock);
|
||||
flag[mypos * CACHE_LINE_SIZE] = 0;
|
||||
UNLOCK_COMMAND(&getrf_flag_lock);
|
||||
|
||||
if (m == 0) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -318,7 +367,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
if ((current != mypos) && (!is)) {
|
||||
#if 1
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
do {
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
} while (jw == 0);
|
||||
#else
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
|
||||
#endif
|
||||
}
|
||||
|
||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
|
||||
|
@ -327,7 +387,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
|
||||
MB;
|
||||
if (is + min_i >= m) {
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -339,7 +401,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
|||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
#if 1
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
do {
|
||||
LOCK_COMMAND(&getrf_lock);
|
||||
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
|
||||
UNLOCK_COMMAND(&getrf_lock);
|
||||
} while(jw != 0);
|
||||
#else
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -374,11 +447,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
BLASLONG i, j, k, is, bk;
|
||||
|
||||
BLASLONG num_cpu;
|
||||
BLASLONG f;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
|
||||
#else
|
||||
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
@ -501,11 +580,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
if (mm >= nn) {
|
||||
|
||||
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||
if (width == 0) width = nn;
|
||||
if (nn < width) width = nn;
|
||||
nn -= width;
|
||||
range_N[num_cpu + 1] = range_N[num_cpu] + width;
|
||||
|
||||
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||
if (width == 0) width = mm;
|
||||
if (mm < width) width = mm;
|
||||
if (nn <= 0) width = mm;
|
||||
mm -= width;
|
||||
|
@ -514,11 +595,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
} else {
|
||||
|
||||
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||
if (width == 0) width = mm;
|
||||
if (mm < width) width = mm;
|
||||
mm -= width;
|
||||
range_M[num_cpu + 1] = range_M[num_cpu] + width;
|
||||
|
||||
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||
if (width == 0) width = nn;
|
||||
if (nn < width) width = nn;
|
||||
if (mm <= 0) width = nn;
|
||||
nn -= width;
|
||||
|
@ -561,7 +644,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
range_n_new[1] = offset + is + bk;
|
||||
|
||||
if (num_cpu > 0) {
|
||||
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
exec_blas_async(0, &queue[0]);
|
||||
|
@ -572,8 +654,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
|
||||
if (iinfo && !info) info = iinfo + is;
|
||||
|
||||
for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
|
||||
|
||||
for (i = 0; i < num_cpu; i ++) {
|
||||
#if 1
|
||||
LOCK_COMMAND(&getrf_flag_lock);
|
||||
f=flag[i*CACHE_LINE_SIZE];
|
||||
UNLOCK_COMMAND(&getrf_flag_lock);
|
||||
while (f!=0) {
|
||||
LOCK_COMMAND(&getrf_flag_lock);
|
||||
f=flag[i*CACHE_LINE_SIZE];
|
||||
UNLOCK_COMMAND(&getrf_flag_lock);
|
||||
};
|
||||
#else
|
||||
while (flag[i*CACHE_LINE_SIZE]) {};
|
||||
#endif
|
||||
}
|
||||
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
|
||||
|
||||
} else {
|
||||
|
@ -634,8 +728,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
BLASLONG range[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, nn, num_cpu;
|
||||
|
||||
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
|
|
|
@ -0,0 +1,664 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifndef KERNEL_FUNC
|
||||
#ifndef LOWER
|
||||
#define KERNEL_FUNC SYRK_KERNEL_U
|
||||
#else
|
||||
#define KERNEL_FUNC SYRK_KERNEL_L
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef LOWER
|
||||
#ifndef COMPLEX
|
||||
#define TRSM_KERNEL TRSM_KERNEL_LT
|
||||
#else
|
||||
#define TRSM_KERNEL TRSM_KERNEL_LC
|
||||
#endif
|
||||
#else
|
||||
#ifndef COMPLEX
|
||||
#define TRSM_KERNEL TRSM_KERNEL_RN
|
||||
#else
|
||||
#define TRSM_KERNEL TRSM_KERNEL_RR
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef CACHE_LINE_SIZE
|
||||
#define CACHE_LINE_SIZE 8
|
||||
#endif
|
||||
|
||||
#ifndef DIVIDE_RATE
|
||||
#define DIVIDE_RATE 2
|
||||
#endif
|
||||
|
||||
#ifndef SWITCH_RATIO
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
#ifndef LOWER
|
||||
#define TRANS
|
||||
#endif
|
||||
|
||||
#ifndef SYRK_LOCAL
|
||||
#if !defined(LOWER) && !defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_UN
|
||||
#elif !defined(LOWER) && defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_UT
|
||||
#elif defined(LOWER) && !defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_LN
|
||||
#else
|
||||
#define SYRK_LOCAL SYRK_LT
|
||||
#endif
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
#ifndef KERNEL_OPERATION
|
||||
#ifndef COMPLEX
|
||||
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
|
||||
KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
|
||||
#else
|
||||
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
|
||||
KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ICOPY_OPERATION
|
||||
#ifndef TRANS
|
||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||
#else
|
||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef OCOPY_OPERATION
|
||||
#ifdef TRANS
|
||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||
#else
|
||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef S
|
||||
#define S args -> a
|
||||
#endif
|
||||
#ifndef A
|
||||
#define A args -> b
|
||||
#endif
|
||||
#ifndef C
|
||||
#define C args -> c
|
||||
#endif
|
||||
#ifndef LDA
|
||||
#define LDA args -> lda
|
||||
#endif
|
||||
#ifndef N
|
||||
#define N args -> m
|
||||
#endif
|
||||
#ifndef K
|
||||
#define K args -> k
|
||||
#endif
|
||||
|
||||
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
||||
|
||||
FLOAT *buffer[DIVIDE_RATE];
|
||||
|
||||
BLASLONG k, lda;
|
||||
BLASLONG m_from, m_to;
|
||||
|
||||
FLOAT *alpha;
|
||||
FLOAT *a, *c;
|
||||
job_t *job = (job_t *)args -> common;
|
||||
BLASLONG xxx, bufferside;
|
||||
|
||||
BLASLONG jjs, min_jj;
|
||||
BLASLONG is, min_i, div_n;
|
||||
|
||||
BLASLONG i, current;
|
||||
|
||||
k = K;
|
||||
|
||||
a = (FLOAT *)A;
|
||||
c = (FLOAT *)C;
|
||||
|
||||
lda = LDA;
|
||||
|
||||
alpha = (FLOAT *)args -> alpha;
|
||||
|
||||
m_from = range_n[mypos + 0];
|
||||
m_to = range_n[mypos + 1];
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
|
||||
#endif
|
||||
|
||||
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||
for (i = 1; i < DIVIDE_RATE; i++) {
|
||||
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
|
||||
#else
|
||||
TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
|
||||
#endif
|
||||
|
||||
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
|
||||
|
||||
for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
|
||||
|
||||
min_jj = MIN(m_to, xxx + div_n) - jjs;
|
||||
|
||||
#ifndef LOWER
|
||||
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
|
||||
#else
|
||||
if (min_jj > GEMM_P) min_jj = GEMM_P;
|
||||
#endif
|
||||
|
||||
#ifndef LOWER
|
||||
OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
|
||||
|
||||
TRSM_KERNEL (k, min_jj, k, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
sb,
|
||||
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
|
||||
a + jjs * lda * COMPSIZE, lda, 0);
|
||||
#else
|
||||
ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
|
||||
|
||||
TRSM_KERNEL (min_jj, k, k, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
|
||||
sb,
|
||||
a + jjs * COMPSIZE, lda, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
for (i = 0; i <= mypos; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
#else
|
||||
for (i = mypos; i < args -> nthreads; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
#endif
|
||||
|
||||
WMB;
|
||||
}
|
||||
|
||||
min_i = m_to - m_from;
|
||||
|
||||
if (min_i >= GEMM_P * 2) {
|
||||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
|
||||
#else
|
||||
OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
|
||||
#endif
|
||||
|
||||
current = mypos;
|
||||
|
||||
#ifndef LOWER
|
||||
while (current < args -> nthreads)
|
||||
#else
|
||||
while (current >= 0)
|
||||
#endif
|
||||
{
|
||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
/* thread has to wait */
|
||||
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
|
||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
|
||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||
c, lda, m_from, xxx);
|
||||
|
||||
if (m_from + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
current ++;
|
||||
#else
|
||||
current --;
|
||||
#endif
|
||||
}
|
||||
|
||||
for(is = m_from + min_i; is < m_to; is += min_i){
|
||||
min_i = m_to - is;
|
||||
|
||||
if (min_i >= GEMM_P * 2) {
|
||||
min_i = GEMM_P;
|
||||
} else
|
||||
if (min_i > GEMM_P) {
|
||||
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
}
|
||||
|
||||
#ifndef LOWER
|
||||
ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
|
||||
#else
|
||||
OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
|
||||
#endif
|
||||
|
||||
current = mypos;
|
||||
|
||||
#ifndef LOWER
|
||||
while (current < args -> nthreads)
|
||||
#else
|
||||
while (current >= 0)
|
||||
#endif
|
||||
{
|
||||
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
|
||||
|
||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||
|
||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
|
||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||
c, lda, is, xxx);
|
||||
|
||||
if (is + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
#ifndef LOWER
|
||||
current ++;
|
||||
#else
|
||||
current --;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
if (i != mypos) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
job_t job[MAX_CPU_NUMBER];
|
||||
#else
|
||||
job_t * job = NULL;
|
||||
#endif
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
||||
BLASLONG range[MAX_CPU_NUMBER + 100];
|
||||
|
||||
BLASLONG num_cpu;
|
||||
|
||||
BLASLONG nthreads = args -> nthreads;
|
||||
|
||||
BLASLONG width, i, j, k;
|
||||
BLASLONG n, n_from, n_to;
|
||||
int mode, mask;
|
||||
double dnum;
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
|
||||
#elif defined(DOUBLE)
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
|
||||
#elif defined(DOUBLE)
|
||||
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
newarg.m = args -> m;
|
||||
newarg.k = args -> k;
|
||||
newarg.a = args -> a;
|
||||
newarg.b = args -> b;
|
||||
newarg.c = args -> c;
|
||||
newarg.lda = args -> lda;
|
||||
newarg.alpha = args -> alpha;
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
if(job==NULL){
|
||||
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
newarg.common = (void *)job;
|
||||
|
||||
n_from = 0;
|
||||
n_to = args -> m;
|
||||
|
||||
#ifndef LOWER
|
||||
|
||||
range[MAX_CPU_NUMBER] = n_to - n_from;
|
||||
range[0] = 0;
|
||||
num_cpu = 0;
|
||||
i = 0;
|
||||
n = n_to - n_from;
|
||||
|
||||
dnum = (double)n * (double)n /(double)nthreads;
|
||||
|
||||
while (i < n){
|
||||
|
||||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
double di = (double)i;
|
||||
|
||||
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
||||
|
||||
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));
|
||||
|
||||
if ((width > n - i) || (width < mask)) width = n - i;
|
||||
|
||||
} else {
|
||||
width = n - i;
|
||||
}
|
||||
|
||||
range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = inner_thread;
|
||||
queue[num_cpu].args = &newarg;
|
||||
queue[num_cpu].range_m = NULL;
|
||||
|
||||
queue[num_cpu].sa = NULL;
|
||||
queue[num_cpu].sb = NULL;
|
||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||
|
||||
num_cpu ++;
|
||||
i += width;
|
||||
}
|
||||
|
||||
for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];
|
||||
|
||||
#else
|
||||
|
||||
range[0] = 0;
|
||||
num_cpu = 0;
|
||||
i = 0;
|
||||
n = n_to - n_from;
|
||||
|
||||
dnum = (double)n * (double)n /(double)nthreads;
|
||||
|
||||
while (i < n){
|
||||
|
||||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
double di = (double)i;
|
||||
|
||||
width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
|
||||
|
||||
if ((width > n - i) || (width < mask)) width = n - i;
|
||||
|
||||
} else {
|
||||
width = n - i;
|
||||
}
|
||||
|
||||
range[num_cpu + 1] = range[num_cpu] + width;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = inner_thread;
|
||||
queue[num_cpu].args = &newarg;
|
||||
queue[num_cpu].range_m = NULL;
|
||||
queue[num_cpu].range_n = range;
|
||||
queue[num_cpu].sa = NULL;
|
||||
queue[num_cpu].sb = NULL;
|
||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||
|
||||
num_cpu ++;
|
||||
i += width;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
newarg.nthreads = num_cpu;
|
||||
|
||||
if (num_cpu) {
|
||||
|
||||
for (j = 0; j < num_cpu; j++) {
|
||||
for (i = 0; i < num_cpu; i++) {
|
||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||
job[j].working[i][CACHE_LINE_SIZE * k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sb;
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
exec_blas(num_cpu, queue);
|
||||
}
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
|
||||
|
||||
BLASLONG n, bk, i, blocking, lda;
|
||||
BLASLONG info;
|
||||
int mode;
|
||||
blas_arg_t newarg;
|
||||
FLOAT *a;
|
||||
FLOAT alpha[2] = { -ONE, ZERO};
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (args -> nthreads == 1) {
|
||||
#ifndef LOWER
|
||||
info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
|
||||
#else
|
||||
info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
|
||||
#endif
|
||||
return info;
|
||||
}
|
||||
|
||||
n = args -> n;
|
||||
a = (FLOAT *)args -> a;
|
||||
lda = args -> lda;
|
||||
|
||||
if (range_n) n = range_n[1] - range_n[0];
|
||||
|
||||
if (n <= GEMM_UNROLL_N * 2) {
|
||||
#ifndef LOWER
|
||||
info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
|
||||
#else
|
||||
info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
|
||||
#endif
|
||||
return info;
|
||||
}
|
||||
|
||||
newarg.lda = lda;
|
||||
newarg.ldb = lda;
|
||||
newarg.ldc = lda;
|
||||
newarg.alpha = alpha;
|
||||
newarg.beta = NULL;
|
||||
newarg.nthreads = args -> nthreads;
|
||||
|
||||
blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
|
||||
if (blocking > GEMM_Q) blocking = GEMM_Q;
|
||||
|
||||
for (i = 0; i < n; i += blocking) {
|
||||
bk = n - i;
|
||||
if (bk > blocking) bk = blocking;
|
||||
|
||||
newarg.m = bk;
|
||||
newarg.n = bk;
|
||||
newarg.a = a + (i + i * lda) * COMPSIZE;
|
||||
|
||||
info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
|
||||
if (info) return info + i;
|
||||
|
||||
if (n - i - bk > 0) {
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
newarg.m = n - i - bk;
|
||||
newarg.k = bk;
|
||||
#ifndef LOWER
|
||||
newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE;
|
||||
#else
|
||||
newarg.b = a + ((i + bk) + i * lda) * COMPSIZE;
|
||||
#endif
|
||||
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
|
||||
|
||||
thread_driver(&newarg, sa, sb);
|
||||
#else
|
||||
|
||||
#ifndef LOWER
|
||||
newarg.m = bk;
|
||||
newarg.n = n - i - bk;
|
||||
newarg.a = a + (i + i * lda) * COMPSIZE;
|
||||
newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
|
||||
|
||||
gemm_thread_n(mode | BLAS_TRANSA_T,
|
||||
&newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);
|
||||
|
||||
newarg.n = n - i - bk;
|
||||
newarg.k = bk;
|
||||
newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE;
|
||||
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
|
||||
|
||||
#if 0
|
||||
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
|
||||
#else
|
||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
|
||||
&newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
|
||||
#endif
|
||||
#else
|
||||
newarg.m = n - i - bk;
|
||||
newarg.n = bk;
|
||||
newarg.a = a + (i + i * lda) * COMPSIZE;
|
||||
newarg.b = a + (i + bk + i * lda) * COMPSIZE;
|
||||
|
||||
gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
|
||||
&newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);
|
||||
|
||||
newarg.n = n - i - bk;
|
||||
newarg.k = bk;
|
||||
newarg.a = a + (i + bk + i * lda) * COMPSIZE;
|
||||
newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
|
||||
|
||||
#if 0
|
||||
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
|
||||
#else
|
||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
|
||||
&newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
2
param.h
2
param.h
|
@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500)
|
||||
#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500)
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ endif ()
|
|||
|
||||
# known to hang with the native Windows and Android threads
|
||||
# FIXME needs checking if this works on any of the other platforms
|
||||
if (NOT NO_CBLAS)
|
||||
if (NOT USE_OPENMP)
|
||||
if (OS_CYGWIN_NT OR OS_LINUX)
|
||||
set(OpenBLAS_utest_src
|
||||
|
@ -33,6 +34,7 @@ set(OpenBLAS_utest_src
|
|||
)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT NO_LAPACK)
|
||||
set(OpenBLAS_utest_src
|
||||
|
|
|
@ -17,11 +17,13 @@ endif
|
|||
|
||||
#this does not work with OpenMP nor with native Windows or Android threads
|
||||
# FIXME TBD if this works on OSX, SunOS, POWER and zarch
|
||||
ifneq ($(NO_CBLAS), 1)
|
||||
ifndef USE_OPENMP
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
|
||||
OBJS += test_fork.o
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
all : run_test
|
||||
|
||||
|
|
|
@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
|
||||
#include "openblas_utest.h"
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <cblas.h>
|
||||
|
||||
|
|
Loading…
Reference in New Issue