Merge pull request #1570 from xianyi/develop
Update release-0.3.0 branch to match develop
This commit is contained in:
commit
939452ea9d
23
.travis.yml
23
.travis.yml
|
@ -7,6 +7,7 @@ language: c
|
||||||
jobs:
|
jobs:
|
||||||
include:
|
include:
|
||||||
- &test-ubuntu
|
- &test-ubuntu
|
||||||
|
os: linux
|
||||||
stage: test
|
stage: test
|
||||||
compiler: gcc
|
compiler: gcc
|
||||||
addons:
|
addons:
|
||||||
|
@ -57,7 +58,8 @@ jobs:
|
||||||
- TARGET_BOX=LINUX32
|
- TARGET_BOX=LINUX32
|
||||||
- BTYPE="BINARY=32"
|
- BTYPE="BINARY=32"
|
||||||
|
|
||||||
- stage: test
|
- os: linux
|
||||||
|
stage: test
|
||||||
compiler: gcc
|
compiler: gcc
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
|
@ -77,6 +79,7 @@ jobs:
|
||||||
# which is slower than container-based infrastructure used for jobs
|
# which is slower than container-based infrastructure used for jobs
|
||||||
# that don't require sudo.
|
# that don't require sudo.
|
||||||
- &test-alpine
|
- &test-alpine
|
||||||
|
os: linux
|
||||||
stage: test
|
stage: test
|
||||||
dist: trusty
|
dist: trusty
|
||||||
sudo: true
|
sudo: true
|
||||||
|
@ -120,6 +123,7 @@ jobs:
|
||||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
||||||
|
|
||||||
- &test-cmake
|
- &test-cmake
|
||||||
|
os: linux
|
||||||
stage: test
|
stage: test
|
||||||
compiler: clang
|
compiler: clang
|
||||||
addons:
|
addons:
|
||||||
|
@ -147,6 +151,23 @@ jobs:
|
||||||
env:
|
env:
|
||||||
- CMAKE=1
|
- CMAKE=1
|
||||||
|
|
||||||
|
- &test-macos
|
||||||
|
os: osx
|
||||||
|
stage: test
|
||||||
|
osx_image: xcode8
|
||||||
|
before_script:
|
||||||
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||||
|
- brew update
|
||||||
|
- brew install gcc # for gfortran
|
||||||
|
script:
|
||||||
|
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
|
env:
|
||||||
|
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||||
|
|
||||||
|
- <<: *test-macos
|
||||||
|
env:
|
||||||
|
- BTYPE="BINARY=32"
|
||||||
|
|
||||||
# whitelist
|
# whitelist
|
||||||
branches:
|
branches:
|
||||||
only:
|
only:
|
||||||
|
|
6
Makefile
6
Makefile
|
@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||||
@$(MAKE) -C exports so
|
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
|
||||||
endif
|
|
||||||
ifeq ($(OSNAME), NetBSD)
|
|
||||||
@$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
|
||||||
endif
|
|
||||||
ifeq ($(OSNAME), NetBSD)
|
|
||||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
|
@ -101,8 +96,9 @@ endif
|
||||||
|
|
||||||
#Generating openblas.pc
|
#Generating openblas.pc
|
||||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
|
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
|
@ -115,7 +111,7 @@ endif
|
||||||
|
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
#ifeq logical or
|
#ifeq logical or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||||
|
|
|
@ -17,6 +17,10 @@ ifdef CPUIDEMU
|
||||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), 1004K)
|
||||||
|
TARGET_FLAGS = -mips32r2
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(TARGET), P5600)
|
ifeq ($(TARGET), P5600)
|
||||||
TARGET_FLAGS = -mips32r5
|
TARGET_FLAGS = -mips32r5
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
|
||||||
# automatically detected by the script.
|
# automatically detected by the script.
|
||||||
# NUM_THREADS = 24
|
# NUM_THREADS = 24
|
||||||
|
|
||||||
|
# If you have enabled USE_OPENMP and your application would call
|
||||||
|
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||||
|
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||||
|
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||||
|
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||||
|
# NUM_PARALLEL = 2
|
||||||
|
|
||||||
# if you don't need to install the static library, please comment it in.
|
# if you don't need to install the static library, please comment it in.
|
||||||
# NO_STATIC = 1
|
# NO_STATIC = 1
|
||||||
|
|
||||||
|
|
|
@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||||
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
||||||
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
||||||
ifeq ($(origin CC),default)
|
ifeq ($(origin CC),default)
|
||||||
|
|
||||||
|
# Check if $(CC) refers to a valid command and set the value to gcc if not
|
||||||
|
ifneq ($(findstring cmd.exe,$(SHELL)),)
|
||||||
|
ifeq ($(shell where $(CC) 2>NUL),)
|
||||||
CC = gcc
|
CC = gcc
|
||||||
# Change the default compile to clang on Mac OSX.
|
|
||||||
# http://stackoverflow.com/questions/714100/os-detecting-makefile
|
|
||||||
UNAME_S := $(shell uname -s)
|
|
||||||
ifeq ($(UNAME_S),Darwin)
|
|
||||||
CC = clang
|
|
||||||
# EXTRALIB += -Wl,-no_compact_unwind
|
|
||||||
endif
|
|
||||||
endif
|
endif
|
||||||
|
else # POSIX-ish
|
||||||
|
ifeq ($(shell command -v $(CC) 2>/dev/null),)
|
||||||
|
ifeq ($(shell uname -s),Darwin)
|
||||||
|
CC = clang
|
||||||
|
# EXTRALIB += -Wl,-no_compact_unwind
|
||||||
|
else
|
||||||
|
CC = gcc
|
||||||
|
endif # Darwin
|
||||||
|
endif # CC exists
|
||||||
|
endif # Shell is sane
|
||||||
|
|
||||||
|
endif # CC is set to default
|
||||||
|
|
||||||
# Default Fortran compiler (FC) is selected by f_check.
|
# Default Fortran compiler (FC) is selected by f_check.
|
||||||
|
|
||||||
|
@ -175,6 +184,10 @@ endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef NUM_PARALLEL
|
||||||
|
NUM_PARALLEL = 1
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef NUM_THREADS
|
ifndef NUM_THREADS
|
||||||
NUM_THREADS = $(NUM_CORES)
|
NUM_THREADS = $(NUM_CORES)
|
||||||
endif
|
endif
|
||||||
|
@ -230,7 +243,7 @@ endif
|
||||||
MD5SUM = md5 -r
|
MD5SUM = md5 -r
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
|
||||||
MD5SUM = md5 -r
|
MD5SUM = md5 -r
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -424,7 +437,7 @@ CCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), INTEL)
|
ifeq ($(C_COMPILER), INTEL)
|
||||||
CCOMMON_OPT += -openmp
|
CCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), PGI)
|
ifeq ($(C_COMPILER), PGI)
|
||||||
|
@ -555,9 +568,14 @@ CCOMMON_OPT += -march=mips64
|
||||||
FCOMMON_OPT += -march=mips64
|
FCOMMON_OPT += -march=mips64
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), 1004K)
|
||||||
|
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||||
|
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), P5600)
|
ifeq ($(CORE), P5600)
|
||||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), I6400)
|
ifeq ($(CORE), I6400)
|
||||||
|
@ -704,7 +722,7 @@ FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
FCOMMON_OPT += -openmp
|
FCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -952,6 +970,8 @@ endif
|
||||||
|
|
||||||
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
||||||
|
|
||||||
|
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
||||||
|
|
||||||
ifdef USE_SIMPLE_THREADED_LEVEL3
|
ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||||
endif
|
endif
|
||||||
|
|
230
README.md
230
README.md
|
@ -5,175 +5,219 @@
|
||||||
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
||||||
|
|
||||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||||
|
|
||||||
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||||
|
|
||||||
## Binary Packages
|
## Binary Packages
|
||||||
We provide binary packages for the following platform.
|
|
||||||
|
We provide official binary packages for the following platform:
|
||||||
|
|
||||||
* Windows x86/x86_64
|
* Windows x86/x86_64
|
||||||
|
|
||||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||||
|
|
||||||
## Installation from Source
|
## Installation from Source
|
||||||
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
|
||||||
|
|
||||||
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
||||||
|
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||||
|
|
||||||
|
### Dependencies
|
||||||
|
|
||||||
|
Building OpenBLAS requires the following to be installed:
|
||||||
|
|
||||||
|
* GNU Make
|
||||||
|
* A C compiler, e.g. GCC or Clang
|
||||||
|
* A Fortran compiler (optional, for LAPACK)
|
||||||
|
* IBM MASS (optional, see below)
|
||||||
|
|
||||||
### Normal compile
|
### Normal compile
|
||||||
* type "make" to detect the CPU automatically.
|
|
||||||
or
|
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
|
||||||
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
|
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
|
||||||
|
The full target list is in the file `TargetList.txt`.
|
||||||
|
|
||||||
### Cross compile
|
### Cross compile
|
||||||
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
|
|
||||||
|
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
|
||||||
|
The target must be specified explicitly when cross compiling.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
On X86 box, compile this library for loongson3a CPU.
|
* On an x86 box, compile this library for a loongson3a CPU:
|
||||||
|
```sh
|
||||||
|
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||||
|
```
|
||||||
|
|
||||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
|
||||||
|
```sh
|
||||||
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
|
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||||
|
```
|
||||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
|
||||||
|
|
||||||
### Debug version
|
### Debug version
|
||||||
|
|
||||||
make DEBUG=1
|
A debug version can be built using `make DEBUG=1`.
|
||||||
|
|
||||||
### Compile with MASS Support on Power CPU (Optional dependency)
|
### Compile with MASS support on Power CPU (optional)
|
||||||
|
|
||||||
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
|
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
||||||
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
|
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
||||||
The library can be installed as below -
|
tuned for optimum performance on POWER architectures.
|
||||||
|
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||||
|
The library can be installed as shown:
|
||||||
|
|
||||||
* On Ubuntu:
|
* On Ubuntu:
|
||||||
|
```sh
|
||||||
|
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||||
|
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install libxlmass-devel.8.1.5
|
||||||
|
```
|
||||||
|
|
||||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
|
* On RHEL/CentOS:
|
||||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
|
```sh
|
||||||
sudo apt-get update</br>
|
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||||
sudo apt-get install libxlmass-devel.8.1.5</br>
|
sudo rpm --import repomd.xml.key
|
||||||
|
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||||
|
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||||
|
sudo yum install libxlmass-devel.8.1.5
|
||||||
|
```
|
||||||
|
|
||||||
* On RHEL/CentOS:
|
After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
|
||||||
|
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
|
||||||
|
|
||||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
|
### Install to a specific directory (optional)
|
||||||
sudo rpm --import repomd.xml.key</br>
|
|
||||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
|
|
||||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
|
|
||||||
sudo yum install libxlmass-devel.8.1.5</br>
|
|
||||||
|
|
||||||
After installing MASS library, compile openblas with USE_MASS=1.
|
Use `PREFIX=` when invoking `make`, for example
|
||||||
|
|
||||||
Example:
|
```sh
|
||||||
|
make install PREFIX=your_installation_directory
|
||||||
|
```
|
||||||
|
|
||||||
Compiling on Power8 with MASS support -
|
The default installation directory is `/opt/OpenBLAS`.
|
||||||
|
|
||||||
make USE_MASS=1 TARGET=POWER8
|
## Supported CPUs and Operating Systems
|
||||||
|
|
||||||
### Install to the directory (optional)
|
Please read `GotoBLAS_01Readme.txt`.
|
||||||
|
|
||||||
Example:
|
### Additional supported CPUs
|
||||||
|
|
||||||
make install PREFIX=your_installation_directory
|
#### x86/x86-64
|
||||||
|
|
||||||
The default directory is /opt/OpenBLAS
|
|
||||||
|
|
||||||
## Support CPU & OS
|
|
||||||
Please read GotoBLAS_01Readme.txt
|
|
||||||
|
|
||||||
### Additional support CPU:
|
|
||||||
|
|
||||||
#### x86/x86-64:
|
|
||||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||||
|
|
||||||
#### MIPS64:
|
#### MIPS64
|
||||||
|
|
||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2.
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2.
|
||||||
- **ICT Loongson 3B**: Experimental
|
- **ICT Loongson 3B**: Experimental
|
||||||
|
|
||||||
#### ARM:
|
#### ARM
|
||||||
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
|
|
||||||
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
|
|
||||||
|
|
||||||
#### ARM64:
|
- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
|
||||||
- **ARMV8**: Experimental
|
- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
|
||||||
|
|
||||||
|
#### ARM64
|
||||||
|
|
||||||
|
- **ARMv8**: Experimental
|
||||||
- **ARM Cortex-A57**: Experimental
|
- **ARM Cortex-A57**: Experimental
|
||||||
|
|
||||||
#### PPC/PPC64
|
#### PPC/PPC64
|
||||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
|
|
||||||
|
|
||||||
#### IBM zEnterprise System:
|
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
||||||
|
|
||||||
|
#### IBM zEnterprise System
|
||||||
|
|
||||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||||
|
|
||||||
|
|
||||||
### Support OS:
|
### Supported OS
|
||||||
|
|
||||||
- **GNU/Linux**
|
- **GNU/Linux**
|
||||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
|
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
|
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||||
|
|
||||||
## Usages
|
## Usage
|
||||||
Link with libopenblas.a or -lopenblas for shared library.
|
|
||||||
|
|
||||||
### Set the number of threads with environment variables.
|
Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
|
||||||
|
compiled as a shared library.
|
||||||
|
|
||||||
Examples:
|
### Setting the number of threads using environment variables
|
||||||
|
|
||||||
export OPENBLAS_NUM_THREADS=4
|
Environment variables are used to specify a maximum number of threads.
|
||||||
|
For example,
|
||||||
|
|
||||||
or
|
```sh
|
||||||
|
export OPENBLAS_NUM_THREADS=4
|
||||||
|
export GOTO_NUM_THREADS=4
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
```
|
||||||
|
|
||||||
export GOTO_NUM_THREADS=4
|
The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
|
||||||
|
|
||||||
or
|
If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
|
||||||
|
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
|
||||||
|
compiled with `USE_OPENMP=1`.
|
||||||
|
|
||||||
export OMP_NUM_THREADS=4
|
### Setting the number of threads at runtime
|
||||||
|
|
||||||
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
|
We provide the following functions to control the number of threads at runtime:
|
||||||
|
|
||||||
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
|
```c
|
||||||
|
void goto_set_num_threads(int num_threads);
|
||||||
|
void openblas_set_num_threads(int num_threads);
|
||||||
|
```
|
||||||
|
|
||||||
### Set the number of threads on runtime.
|
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
|
||||||
|
|
||||||
We provided the below functions to control the number of threads on runtime.
|
## Reporting bugs
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads);
|
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
|
||||||
|
|
||||||
void openblas_set_num_threads(int num_threads);
|
|
||||||
|
|
||||||
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
|
|
||||||
|
|
||||||
## Report Bugs
|
|
||||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
|
||||||
|
|
||||||
## Contact
|
## Contact
|
||||||
|
|
||||||
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
||||||
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
||||||
|
|
||||||
## ChangeLog
|
## Change log
|
||||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
|
||||||
|
Please see Changelog.txt to view the differences between OpenBLAS and the GotoBLAS2 1.13 BSD version.
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
|
||||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
|
||||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||||
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
Clang 3.0 will generate the wrong AVX binary code.
|
||||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||||
|
there is experimental support for up to 1024 CPUs/cores and 128 NUMA nodes if you build
|
||||||
|
the library with `BIGNUMA=1`.
|
||||||
|
* OpenBLAS does not set processor affinity by default.
|
||||||
|
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
|
||||||
|
Makefile.rule. However, note that this may cause
|
||||||
|
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||||
|
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
|
||||||
|
However, it will be okay when you run the same test case on the shell.
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
|
|
||||||
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
|
||||||
1. Write a test which shows that the bug was fixed or that the feature works as expected.
|
to start a discussion around a feature idea or a bug.
|
||||||
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||||
|
3. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||||
|
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||||
|
|
||||||
## Donation
|
## Donation
|
||||||
|
|
||||||
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
|
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
|
||||||
|
|
|
@ -56,6 +56,7 @@ CELL
|
||||||
|
|
||||||
3.MIPS CPU:
|
3.MIPS CPU:
|
||||||
P5600
|
P5600
|
||||||
|
1004K
|
||||||
|
|
||||||
4.MIPS64 CPU:
|
4.MIPS64 CPU:
|
||||||
SICORTEX
|
SICORTEX
|
||||||
|
|
14
USAGE.md
14
USAGE.md
|
@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||||
|
|
||||||
|
Despite its name, and due to the use of memory buffers in functions like SGEMM,
|
||||||
|
the setting of NUM_THREADS can be relevant even for a single-threaded build
|
||||||
|
of OpenBLAS, if such functions get called by multiple threads of a program
|
||||||
|
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
|
||||||
|
a segmentation fault without displaying the above warning first.
|
||||||
|
|
||||||
|
Note that the number of threads used at runtime can be altered to differ from the
|
||||||
|
value NUM_THREADS was set to at build time. At runtime, the actual number of
|
||||||
|
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
|
||||||
|
that this does not change the number of memory buffers that will be allocated,
|
||||||
|
which is set at build time). The number of threads for a process can be set by
|
||||||
|
using the mechanisms described below.
|
||||||
|
|
||||||
|
|
||||||
#### How can I use OpenBLAS in multi-threaded applications?
|
#### How can I use OpenBLAS in multi-threaded applications?
|
||||||
|
|
||||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||||
|
|
2
c_check
2
c_check
|
@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
|
||||||
$os = Linux if ($data =~ /OS_LINUX/);
|
$os = Linux if ($data =~ /OS_LINUX/);
|
||||||
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||||
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||||
|
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
|
||||||
|
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
|
||||||
$os = Darwin if ($data =~ /OS_DARWIN/);
|
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||||
$os = SunOS if ($data =~ /OS_SUNOS/);
|
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||||
$os = AIX if ($data =~ /OS_AIX/);
|
$os = AIX if ($data =~ /OS_AIX/);
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||||
|
|
||||||
|
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||||
Name: OpenBLAS
|
Name: OpenBLAS
|
||||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||||
Version: @OPENBLAS_VERSION@
|
Version: @OPENBLAS_VERSION@
|
||||||
|
|
|
@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING)
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED NUM_PARALLEL)
|
||||||
|
set(NUM_PARALLEL 1)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED NUM_THREADS)
|
if (NOT DEFINED NUM_THREADS)
|
||||||
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
||||||
# HT?
|
# HT?
|
||||||
|
@ -224,6 +228,8 @@ endif ()
|
||||||
|
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||||
|
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||||
|
|
||||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
10
common.h
10
common.h
|
@ -93,7 +93,7 @@ extern "C" {
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
|
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -179,7 +179,7 @@ extern "C" {
|
||||||
|
|
||||||
#define ALLOCA_ALIGN 63UL
|
#define ALLOCA_ALIGN 63UL
|
||||||
|
|
||||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||||
|
|
||||||
#ifdef NEEDBUNDERSCORE
|
#ifdef NEEDBUNDERSCORE
|
||||||
#define BLASFUNC(FUNC) FUNC##_
|
#define BLASFUNC(FUNC) FUNC##_
|
||||||
|
@ -649,6 +649,12 @@ int omp_get_num_procs(void);
|
||||||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||||
#endif
|
#endif
|
||||||
|
#if (__STDC_VERSION__ >= 201112L)
|
||||||
|
#ifndef _Atomic
|
||||||
|
#define _Atomic volatile
|
||||||
|
#endif
|
||||||
|
#include <stdatomic.h>
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
#ifdef __ELF__
|
#ifdef __ELF__
|
||||||
int omp_in_parallel (void) __attribute__ ((weak));
|
int omp_in_parallel (void) __attribute__ ((weak));
|
||||||
|
|
10
common_x86.h
10
common_x86.h
|
@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
result = x/y;
|
result = x/y;
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
|
#if (MAX_CPU_NUMBER > 64)
|
||||||
|
if ( y > 64) {
|
||||||
|
result = x/y;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
y = blas_quick_divide_table[y];
|
y = blas_quick_divide_table[y];
|
||||||
|
|
||||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||||
|
@ -327,7 +333,7 @@ REALNAME:
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.text; \
|
.text; \
|
||||||
.align 16; \
|
.align 16; \
|
||||||
|
|
|
@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
|
|
||||||
if (y <= 1) return x;
|
if (y <= 1) return x;
|
||||||
|
|
||||||
|
#if (MAX_CPU_NUMBER > 64)
|
||||||
|
if (y > 64) {
|
||||||
|
result = x / y;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
y = blas_quick_divide_table[y];
|
y = blas_quick_divide_table[y];
|
||||||
|
|
||||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||||
|
@ -403,7 +410,7 @@ REALNAME:
|
||||||
#define EPILOGUE .end
|
#define EPILOGUE .end
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.text; \
|
.text; \
|
||||||
.align 512; \
|
.align 512; \
|
||||||
|
|
|
@ -121,7 +121,7 @@ int detect(void)
|
||||||
return CPU_VULCAN;
|
return CPU_VULCAN;
|
||||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||||
return CPU_THUNDERX;
|
return CPU_THUNDERX;
|
||||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
|
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
|
||||||
return CPU_THUNDERX2T99;
|
return CPU_THUNDERX2T99;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
58
cpuid_mips.c
58
cpuid_mips.c
|
@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define CPU_UNKNOWN 0
|
#define CPU_UNKNOWN 0
|
||||||
#define CPU_P5600 1
|
#define CPU_P5600 1
|
||||||
|
#define CPU_1004K 2
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"UNKOWN",
|
"UNKOWN",
|
||||||
"P5600"
|
"P5600",
|
||||||
|
"1004K"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void){
|
int detect(void){
|
||||||
|
@ -90,7 +92,7 @@ int detect(void){
|
||||||
if (!strncmp("cpu", buffer, 3)){
|
if (!strncmp("cpu", buffer, 3)){
|
||||||
p = strchr(buffer, ':') + 2;
|
p = strchr(buffer, ':') + 2;
|
||||||
#if 0
|
#if 0
|
||||||
fprintf(stderr, "%s\n", p);
|
fprintf(stderr, "%s \n", p);
|
||||||
#endif
|
#endif
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -99,43 +101,13 @@ int detect(void){
|
||||||
fclose(infile);
|
fclose(infile);
|
||||||
|
|
||||||
if(p != NULL){
|
if(p != NULL){
|
||||||
if (strstr(p, "Loongson-3A")){
|
if (strstr(p, "5600")) {
|
||||||
return CPU_LOONGSON3A;
|
return CPU_P5600;
|
||||||
}else if(strstr(p, "Loongson-3B")){
|
} else if (strstr(p, "1004K")) {
|
||||||
return CPU_LOONGSON3B;
|
return CPU_1004K;
|
||||||
}else if (strstr(p, "Loongson-3")){
|
} else
|
||||||
infile = fopen("/proc/cpuinfo", "r");
|
|
||||||
p = (char *)NULL;
|
|
||||||
while (fgets(buffer, sizeof(buffer), infile)){
|
|
||||||
if (!strncmp("system type", buffer, 11)){
|
|
||||||
p = strchr(buffer, ':') + 2;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose(infile);
|
|
||||||
if (strstr(p, "loongson3a"))
|
|
||||||
return CPU_LOONGSON3A;
|
|
||||||
}else{
|
|
||||||
return CPU_UNKNOWN;
|
return CPU_UNKNOWN;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
//Check model name for Loongson3
|
|
||||||
infile = fopen("/proc/cpuinfo", "r");
|
|
||||||
p = (char *)NULL;
|
|
||||||
while (fgets(buffer, sizeof(buffer), infile)){
|
|
||||||
if (!strncmp("model name", buffer, 10)){
|
|
||||||
p = strchr(buffer, ':') + 2;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose(infile);
|
|
||||||
if(p != NULL){
|
|
||||||
if (strstr(p, "Loongson-3A")){
|
|
||||||
return CPU_LOONGSON3A;
|
|
||||||
}else if(strstr(p, "Loongson-3B")){
|
|
||||||
return CPU_LOONGSON3B;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
return CPU_UNKNOWN;
|
return CPU_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
@ -149,7 +121,7 @@ void get_architecture(void){
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_subarchitecture(void){
|
void get_subarchitecture(void){
|
||||||
if(detect()==CPU_P5600){
|
if(detect()==CPU_P5600|| detect()==CPU_1004K){
|
||||||
printf("P5600");
|
printf("P5600");
|
||||||
}else{
|
}else{
|
||||||
printf("UNKNOWN");
|
printf("UNKNOWN");
|
||||||
|
@ -170,6 +142,14 @@ void get_cpuconfig(void){
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
printf("#define DTB_SIZE 4096\n");
|
printf("#define DTB_SIZE 4096\n");
|
||||||
printf("#define L2_ASSOCIATIVE 8\n");
|
printf("#define L2_ASSOCIATIVE 8\n");
|
||||||
|
} else if (detect()==CPU_1004K) {
|
||||||
|
printf("#define MIPS1004K\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 32\n");
|
||||||
|
printf("#define L2_SIZE 26144\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
}else{
|
}else{
|
||||||
printf("#define UNKNOWN\n");
|
printf("#define UNKNOWN\n");
|
||||||
}
|
}
|
||||||
|
@ -178,6 +158,8 @@ void get_cpuconfig(void){
|
||||||
void get_libname(void){
|
void get_libname(void){
|
||||||
if(detect()==CPU_P5600) {
|
if(detect()==CPU_P5600) {
|
||||||
printf("p5600\n");
|
printf("p5600\n");
|
||||||
|
} else if (detect()==CPU_1004K) {
|
||||||
|
printf("1004K\n");
|
||||||
}else{
|
}else{
|
||||||
printf("mips\n");
|
printf("mips\n");
|
||||||
}
|
}
|
||||||
|
|
8
ctest.c
8
ctest.c
|
@ -60,6 +60,14 @@ OS_FREEBSD
|
||||||
OS_NETBSD
|
OS_NETBSD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__OpenBSD__)
|
||||||
|
OS_OPENBSD
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__DragonFly__)
|
||||||
|
OS_DRAGONFLY
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__sun)
|
#if defined(__sun)
|
||||||
OS_SUNOS
|
OS_SUNOS
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -91,7 +91,12 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,12 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -91,7 +91,12 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
|
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
|
|
@ -36,6 +36,7 @@
|
||||||
/* or implied, of The University of Texas at Austin. */
|
/* or implied, of The University of Texas at Austin. */
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
//#include <sys/mman.h>
|
//#include <sys/mman.h>
|
||||||
|
@ -49,11 +50,16 @@
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||||
|
#else
|
||||||
|
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||||
|
#endif
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads) {
|
void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int i=0;
|
int i=0, j=0;
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
|
@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
|
||||||
omp_set_num_threads(blas_cpu_number);
|
omp_set_num_threads(blas_cpu_number);
|
||||||
|
|
||||||
//adjust buffer for each thread
|
//adjust buffer for each thread
|
||||||
for(i=0; i<blas_cpu_number; i++){
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||||
if(blas_thread_buffer[i]==NULL){
|
for(j=0; j<blas_cpu_number; j++){
|
||||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
if(blas_thread_buffer[i][j]==NULL){
|
||||||
|
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
for(; j<MAX_CPU_NUMBER; j++){
|
||||||
for(; i<MAX_CPU_NUMBER; i++){
|
if(blas_thread_buffer[i][j]!=NULL){
|
||||||
if(blas_thread_buffer[i]!=NULL){
|
blas_memory_free(blas_thread_buffer[i][j]);
|
||||||
blas_memory_free(blas_thread_buffer[i]);
|
blas_thread_buffer[i][j]=NULL;
|
||||||
blas_thread_buffer[i]=NULL;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if defined(ARCH_MIPS64)
|
#if defined(ARCH_MIPS64)
|
||||||
|
@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
|
|
||||||
int i=0;
|
int i=0, j=0;
|
||||||
|
|
||||||
blas_get_cpu_number();
|
blas_get_cpu_number();
|
||||||
|
|
||||||
blas_server_avail = 1;
|
blas_server_avail = 1;
|
||||||
|
|
||||||
for(i=0; i<blas_num_threads; i++){
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
for(j=0; j<blas_num_threads; j++){
|
||||||
}
|
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||||
for(; i<MAX_CPU_NUMBER; i++){
|
}
|
||||||
blas_thread_buffer[i]=NULL;
|
for(; j<MAX_CPU_NUMBER; j++){
|
||||||
|
blas_thread_buffer[i][j]=NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BLASFUNC(blas_thread_shutdown)(void){
|
int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
int i=0;
|
int i=0, j=0;
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
|
|
||||||
for(i=0; i<MAX_CPU_NUMBER; i++){
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||||
if(blas_thread_buffer[i]!=NULL){
|
for(j=0; j<MAX_CPU_NUMBER; j++){
|
||||||
blas_memory_free(blas_thread_buffer[i]);
|
if(blas_thread_buffer[i][j]!=NULL){
|
||||||
blas_thread_buffer[i]=NULL;
|
blas_memory_free(blas_thread_buffer[i][j]);
|
||||||
|
blas_thread_buffer[i][j]=NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void exec_threads(blas_queue_t *queue){
|
static void exec_threads(blas_queue_t *queue, int buf_index){
|
||||||
|
|
||||||
void *buffer, *sa, *sb;
|
void *buffer, *sa, *sb;
|
||||||
int pos=0, release_flag=0;
|
int pos=0, release_flag=0;
|
||||||
|
@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||||
|
|
||||||
pos = omp_get_thread_num();
|
pos = omp_get_thread_num();
|
||||||
buffer = blas_thread_buffer[pos];
|
buffer = blas_thread_buffer[buf_index][pos];
|
||||||
|
|
||||||
//fallback
|
//fallback
|
||||||
if(buffer==NULL) {
|
if(buffer==NULL) {
|
||||||
|
@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
|
|
||||||
BLASLONG i;
|
BLASLONG i, buf_index;
|
||||||
|
|
||||||
if ((num <= 0) || (queue == NULL)) return 0;
|
if ((num <= 0) || (queue == NULL)) return 0;
|
||||||
|
|
||||||
|
@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
while(true) {
|
||||||
|
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Bool inuse = false;
|
||||||
|
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||||
|
#else
|
||||||
|
if(blas_buffer_inuse[i] == false) {
|
||||||
|
blas_buffer_inuse[i] = true;
|
||||||
|
#endif
|
||||||
|
buf_index = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(i != MAX_PARALLEL_NUMBER)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (i = 0; i < num; i ++) {
|
for (i = 0; i < num; i ++) {
|
||||||
|
|
||||||
|
@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
queue[i].position = i;
|
queue[i].position = i;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
exec_threads(&queue[i]);
|
exec_threads(&queue[i], buf_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||||
|
#else
|
||||||
|
blas_buffer_inuse[buf_index] = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||||
#include <sys/sysctl.h>
|
#include <sys/sysctl.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||||
#define DESTRUCTOR __attribute__ ((destructor))
|
#define DESTRUCTOR __attribute__ ((destructor))
|
||||||
#else
|
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
|
||||||
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
||||||
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
||||||
|
#else
|
||||||
|
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||||
|
#define DESTRUCTOR __attribute__ ((destructor))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DYNAMIC_ARCH
|
#ifdef DYNAMIC_ARCH
|
||||||
|
@ -209,7 +212,8 @@ int ret;
|
||||||
size = CPU_ALLOC_SIZE(nums);
|
size = CPU_ALLOC_SIZE(nums);
|
||||||
ret = sched_getaffinity(0,size,cpusetp);
|
ret = sched_getaffinity(0,size,cpusetp);
|
||||||
if (ret!=0) return nums;
|
if (ret!=0) return nums;
|
||||||
nums = CPU_COUNT_S(size,cpusetp);
|
ret = CPU_COUNT_S(size,cpusetp);
|
||||||
|
if (ret > 0 && ret < nums) nums = ret;
|
||||||
CPU_FREE(cpusetp);
|
CPU_FREE(cpusetp);
|
||||||
return nums;
|
return nums;
|
||||||
#endif
|
#endif
|
||||||
|
@ -246,7 +250,7 @@ int get_num_procs(void) {
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_FREEBSD)
|
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||||
|
|
||||||
int get_num_procs(void) {
|
int get_num_procs(void) {
|
||||||
|
|
||||||
|
@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env();
|
||||||
extern int openblas_omp_num_threads_env();
|
extern int openblas_omp_num_threads_env();
|
||||||
|
|
||||||
int blas_get_cpu_number(void){
|
int blas_get_cpu_number(void){
|
||||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||||
int max_num;
|
int max_num;
|
||||||
#endif
|
#endif
|
||||||
int blas_goto_num = 0;
|
int blas_goto_num = 0;
|
||||||
|
@ -344,7 +348,7 @@ int blas_get_cpu_number(void){
|
||||||
|
|
||||||
if (blas_num_threads) return blas_num_threads;
|
if (blas_num_threads) return blas_num_threads;
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||||
max_num = get_num_procs();
|
max_num = get_num_procs();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -368,7 +372,7 @@ int blas_get_cpu_number(void){
|
||||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||||
else blas_num_threads = MAX_CPU_NUMBER;
|
else blas_num_threads = MAX_CPU_NUMBER;
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,9 @@ static char* openblas_config_str=""
|
||||||
#ifdef NO_AFFINITY
|
#ifdef NO_AFFINITY
|
||||||
"NO_AFFINITY "
|
"NO_AFFINITY "
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef USE_OPENMP
|
||||||
|
"USE_OPENMP "
|
||||||
|
#endif
|
||||||
#ifndef DYNAMIC_ARCH
|
#ifndef DYNAMIC_ARCH
|
||||||
CHAR_CORENAME
|
CHAR_CORENAME
|
||||||
#endif
|
#endif
|
||||||
|
@ -61,18 +64,23 @@ static char* openblas_config_str=""
|
||||||
|
|
||||||
#ifdef DYNAMIC_ARCH
|
#ifdef DYNAMIC_ARCH
|
||||||
char *gotoblas_corename();
|
char *gotoblas_corename();
|
||||||
static char tmp_config_str[256];
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static char tmp_config_str[256];
|
||||||
|
int openblas_get_parallel();
|
||||||
|
|
||||||
char* CNAME() {
|
char* CNAME() {
|
||||||
#ifndef DYNAMIC_ARCH
|
char tmpstr[20];
|
||||||
return openblas_config_str;
|
|
||||||
#else
|
|
||||||
strcpy(tmp_config_str, openblas_config_str);
|
strcpy(tmp_config_str, openblas_config_str);
|
||||||
|
#ifdef DYNAMIC_ARCH
|
||||||
strcat(tmp_config_str, gotoblas_corename());
|
strcat(tmp_config_str, gotoblas_corename());
|
||||||
return tmp_config_str;
|
|
||||||
#endif
|
#endif
|
||||||
|
if (openblas_get_parallel() == 0)
|
||||||
|
sprintf(tmpstr, " SINGLE_THREADED");
|
||||||
|
else
|
||||||
|
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||||
|
strcat(tmp_config_str, tmpstr);
|
||||||
|
return tmp_config_str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -83,3 +91,4 @@ char* openblas_get_corename() {
|
||||||
return gotoblas_corename();
|
return gotoblas_corename();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -156,7 +156,7 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||||
|
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
|
|
||||||
|
|
6
f_check
6
f_check
|
@ -97,7 +97,7 @@ if ($compiler eq "") {
|
||||||
|
|
||||||
if ($data =~ /Intel/) {
|
if ($data =~ /Intel/) {
|
||||||
$vendor = INTEL;
|
$vendor = INTEL;
|
||||||
$openmp = "-openmp";
|
$openmp = "-fopenmp";
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($data =~ /Sun Fortran/) {
|
if ($data =~ /Sun Fortran/) {
|
||||||
|
@ -127,7 +127,7 @@ if ($compiler eq "") {
|
||||||
|
|
||||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||||
if ($data =~ /zho_ge__/) {
|
if ($data =~ / zho_ge__/) {
|
||||||
$need2bu = 1;
|
$need2bu = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -155,7 +155,7 @@ if ($compiler eq "") {
|
||||||
if ($compiler =~ /ifort/) {
|
if ($compiler =~ /ifort/) {
|
||||||
$vendor = INTEL;
|
$vendor = INTEL;
|
||||||
$bu = "_";
|
$bu = "_";
|
||||||
$openmp = "-openmp";
|
$openmp = "-fopenmp";
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($compiler =~ /pathf/) {
|
if ($compiler =~ /pathf/) {
|
||||||
|
|
|
@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(__FreeBSD__) || defined(__APPLE__)
|
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/sysctl.h>
|
#include <sys/sysctl.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -1074,7 +1074,7 @@ static int get_num_cores(void) {
|
||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
SYSTEM_INFO sysinfo;
|
SYSTEM_INFO sysinfo;
|
||||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||||
int m[2], count;
|
int m[2], count;
|
||||||
size_t len;
|
size_t len;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1088,7 +1088,7 @@ static int get_num_cores(void) {
|
||||||
GetSystemInfo(&sysinfo);
|
GetSystemInfo(&sysinfo);
|
||||||
return sysinfo.dwNumberOfProcessors;
|
return sysinfo.dwNumberOfProcessors;
|
||||||
|
|
||||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||||
m[0] = CTL_HW;
|
m[0] = CTL_HW;
|
||||||
m[1] = HW_NCPU;
|
m[1] = HW_NCPU;
|
||||||
len = sizeof(int);
|
len = sizeof(int);
|
||||||
|
|
|
@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
} else
|
} else
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
|
||||||
|
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
|
||||||
|
nthreads = 1;
|
||||||
|
|
||||||
if(nthreads > 1) {
|
if(nthreads > 1) {
|
||||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,10 +29,8 @@ USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), HASWELL)
|
ifeq ($(CORE), HASWELL)
|
||||||
ifeq ($(ARCH), x86_64)
|
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(CORE), ZEN)
|
ifeq ($(CORE), ZEN)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
|
|
|
@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
cmp N, #0
|
cmp N, #0
|
||||||
ble axpy_kernel_L999
|
ble axpy_kernel_L999
|
||||||
|
/*
|
||||||
cmp INC_X, #0
|
cmp INC_X, #0
|
||||||
beq axpy_kernel_L999
|
beq axpy_kernel_L999
|
||||||
|
|
||||||
cmp INC_Y, #0
|
cmp INC_Y, #0
|
||||||
beq axpy_kernel_L999
|
beq axpy_kernel_L999
|
||||||
|
*/
|
||||||
cmp INC_X, #1
|
cmp INC_X, #1
|
||||||
bne axpy_kernel_S_BEGIN
|
bne axpy_kernel_S_BEGIN
|
||||||
|
|
||||||
|
|
|
@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
cmp N, #0
|
cmp N, #0
|
||||||
ble rot_kernel_L999
|
ble rot_kernel_L999
|
||||||
|
/*
|
||||||
cmp INC_X, #0
|
cmp INC_X, #0
|
||||||
beq rot_kernel_L999
|
beq rot_kernel_L999
|
||||||
|
|
||||||
cmp INC_Y, #0
|
cmp INC_Y, #0
|
||||||
beq rot_kernel_L999
|
beq rot_kernel_L999
|
||||||
|
*/
|
||||||
cmp INC_X, #1
|
cmp INC_X, #1
|
||||||
bne rot_kernel_S_BEGIN
|
bne rot_kernel_S_BEGIN
|
||||||
|
|
||||||
|
@ -584,6 +584,12 @@ rot_kernel_S1:
|
||||||
rot_kernel_S10:
|
rot_kernel_S10:
|
||||||
|
|
||||||
KERNEL_S1
|
KERNEL_S1
|
||||||
|
|
||||||
|
cmp INC_X, #0
|
||||||
|
beq rot_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_Y, #0
|
||||||
|
beq rot_kernel_L999
|
||||||
|
|
||||||
subs I, I, #1
|
subs I, I, #1
|
||||||
bne rot_kernel_S10
|
bne rot_kernel_S10
|
||||||
|
|
|
@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
if (m & 1) {
|
if (m & 1) {
|
||||||
|
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
/* ao1 += 1;
|
ao1 += 1;
|
||||||
ao2 += 1; */
|
ao2 += 1;
|
||||||
b += 2;
|
b += 2;
|
||||||
} else
|
} else
|
||||||
#ifdef UNIT
|
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
#endif
|
data01 = *(ao1 + 0);
|
||||||
b[ 0] = *(ao1 + 0);
|
data02 = *(ao1 + 1);
|
||||||
#ifdef UNIT
|
|
||||||
|
b[ 0] = data01;
|
||||||
|
b[ 1] = data02;
|
||||||
|
ao1 += lda;
|
||||||
|
b += 2;
|
||||||
} else {
|
} else {
|
||||||
|
#ifdef UNIT
|
||||||
|
data02 = *(ao1 + 1);
|
||||||
|
|
||||||
b[ 0] = ONE;
|
b[ 0] = ONE;
|
||||||
|
b[ 1] = data02;
|
||||||
|
#else
|
||||||
|
data01 = *(ao1 + 0);
|
||||||
|
data02 = *(ao1 + 1);
|
||||||
|
|
||||||
|
b[ 0] = data01;
|
||||||
|
b[ 1] = data02;
|
||||||
|
#endif
|
||||||
|
ao1 += 2;
|
||||||
|
b += 2;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
b[ 1] = *(ao1 + 1);
|
|
||||||
b += 2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
posY += 2;
|
posY += 2;
|
||||||
|
@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
} while (i > 0);
|
} while (i > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// posY += 1;
|
posY += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
i = (m & 15);
|
i = (m & 15);
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
/* a01 += i;
|
a01 += i;
|
||||||
a02 += i;
|
a02 += i;
|
||||||
a03 += i;
|
a03 += i;
|
||||||
a04 += i;
|
a04 += i;
|
||||||
|
@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
a13 += i;
|
a13 += i;
|
||||||
a14 += i;
|
a14 += i;
|
||||||
a15 += i;
|
a15 += i;
|
||||||
a16 += i; */
|
a16 += i;
|
||||||
b += 16 * i;
|
b += 16 * i;
|
||||||
} else
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
|
@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
i = (m & 7);
|
i = (m & 7);
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
/* a01 += i;
|
a01 += i;
|
||||||
a02 += i;
|
a02 += i;
|
||||||
a03 += i;
|
a03 += i;
|
||||||
a04 += i;
|
a04 += i;
|
||||||
a05 += i;
|
a05 += i;
|
||||||
a06 += i;
|
a06 += i;
|
||||||
a07 += i;
|
a07 += i;
|
||||||
a08 += i; */
|
a08 += i;
|
||||||
b += 8 * i;
|
b += 8 * i;
|
||||||
} else
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
|
@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
b += 8;
|
b += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* a02 += i * lda;
|
a02 += i * lda;
|
||||||
a03 += i * lda;
|
a03 += i * lda;
|
||||||
a04 += i * lda;
|
a04 += i * lda;
|
||||||
a05 += i * lda;
|
a05 += i * lda;
|
||||||
a06 += i * lda;
|
a06 += i * lda;
|
||||||
a07 += i * lda;
|
a07 += i * lda;
|
||||||
a08 += i * lda; */
|
a08 += i * lda;
|
||||||
} else {
|
} else {
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
b[ 0] = ONE;
|
b[ 0] = ONE;
|
||||||
|
@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
i = (m & 3);
|
i = (m & 3);
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
/* a01 += i;
|
a01 += i;
|
||||||
a02 += i;
|
a02 += i;
|
||||||
a03 += i;
|
a03 += i;
|
||||||
a04 += i; */
|
a04 += i;
|
||||||
b += 4 * i;
|
b += 4 * i;
|
||||||
} else
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
|
@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
a01 += lda;
|
a01 += lda;
|
||||||
b += 4;
|
b += 4;
|
||||||
}
|
}
|
||||||
/* a02 += lda;
|
a02 += lda;
|
||||||
a03 += lda;
|
a03 += lda;
|
||||||
a04 += lda; */
|
a04 += lda;
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
|
@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
a01 ++;
|
a01 ++;
|
||||||
a02 ++;
|
a02 ++;
|
||||||
} else {
|
b += 2;
|
||||||
#ifdef UNIT
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
#endif
|
|
||||||
b[ 0] = *(a01 + 0);
|
b[ 0] = *(a01 + 0);
|
||||||
#ifdef UNIT
|
b[ 1] = *(a01 + 1);
|
||||||
|
a01 += lda;
|
||||||
|
b += 2;
|
||||||
} else {
|
} else {
|
||||||
|
#ifdef UNIT
|
||||||
b[ 0] = ONE;
|
b[ 0] = ONE;
|
||||||
}
|
b[ 1] = *(a01 + 1);
|
||||||
|
#else
|
||||||
|
b[ 0] = *(a01 + 0);
|
||||||
|
b[ 1] = *(a01 + 1);
|
||||||
#endif
|
#endif
|
||||||
b[ 1] = *(a01 + 1);
|
b += 2;
|
||||||
}
|
}
|
||||||
b += 2;
|
|
||||||
}
|
}
|
||||||
posY += 2;
|
posY += 2;
|
||||||
}
|
}
|
||||||
|
@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
do {
|
do {
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
a01 ++;
|
a01 += 1;
|
||||||
} else {
|
b ++;
|
||||||
#ifdef UNIT
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
#endif
|
|
||||||
b[ 0] = *(a01 + 0);
|
b[ 0] = *(a01 + 0);
|
||||||
#ifdef UNIT
|
a01 += lda;
|
||||||
|
b ++;
|
||||||
} else {
|
} else {
|
||||||
|
#ifdef UNIT
|
||||||
b[ 0] = ONE;
|
b[ 0] = ONE;
|
||||||
}
|
#else
|
||||||
|
b[ 0] = *(a01 + 0);
|
||||||
#endif
|
#endif
|
||||||
a01 += lda;
|
a01 += lda;
|
||||||
}
|
b ++;
|
||||||
b ++;
|
}
|
||||||
X ++;
|
|
||||||
i --;
|
X += 1;
|
||||||
|
i --;
|
||||||
} while (i > 0);
|
} while (i > 0);
|
||||||
}
|
}
|
||||||
// posY += 1;
|
posY += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
if (m & 1) {
|
if (m & 1) {
|
||||||
|
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
/* ao1 += 1;
|
ao1 += 1;
|
||||||
ao2 += 1; */
|
ao2 += 1;
|
||||||
b += 2;
|
b += 2;
|
||||||
} else
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
|
@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
|
|
||||||
b[ 0] = data01;
|
b[ 0] = data01;
|
||||||
b[ 1] = data02;
|
b[ 1] = data02;
|
||||||
// ao1 += lda;
|
ao1 += lda;
|
||||||
b += 2;
|
b += 2;
|
||||||
} else {
|
} else {
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
|
@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
b[ 0] = data01;
|
b[ 0] = data01;
|
||||||
b[ 1] = ZERO;
|
b[ 1] = ZERO;
|
||||||
#endif
|
#endif
|
||||||
// ao1 += lda;
|
ao1 += lda;
|
||||||
b += 2;
|
b += 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
i = m;
|
i = m;
|
||||||
if (m > 0) {
|
if (m > 0) {
|
||||||
do {
|
do {
|
||||||
|
if (X < posY) {
|
||||||
|
b += 1;
|
||||||
|
ao1 += 1;
|
||||||
|
} else
|
||||||
|
if (X > posY) {
|
||||||
|
data01 = *(ao1 + 0);
|
||||||
|
b[ 0] = data01;
|
||||||
|
b += 1;
|
||||||
|
ao1 += lda;
|
||||||
|
} else {
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
if (X > posY) {
|
b[ 0] = ONE;
|
||||||
|
#else
|
||||||
|
data01 = *(ao1 + 0);
|
||||||
|
b[ 0] = data01;
|
||||||
#endif
|
#endif
|
||||||
b[ 0] = *(ao1 + 0);
|
b += 1;
|
||||||
#ifdef UNIT
|
ao1 += lda;
|
||||||
} else {
|
}
|
||||||
b[ 0] = ONE;
|
|
||||||
}
|
X += 1;
|
||||||
#endif
|
|
||||||
b ++;
|
|
||||||
ao1 += lda;
|
|
||||||
X ++;
|
|
||||||
i --;
|
i --;
|
||||||
} while (i > 0);
|
} while (i > 0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
|
|
||||||
if (m & 2) {
|
if (m & 2) {
|
||||||
/* ao1 += 2;
|
ao1 += 2;
|
||||||
ao2 += 2;
|
ao2 += 2;
|
||||||
ao3 += 2;
|
ao3 += 2;
|
||||||
ao4 += 2; */
|
ao4 += 2;
|
||||||
b += 8;
|
b += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m & 1) {
|
if (m & 1) {
|
||||||
/* ao1 += 1;
|
ao1 += 1;
|
||||||
ao2 += 1;
|
ao2 += 1;
|
||||||
ao3 += 1;
|
ao3 += 1;
|
||||||
ao4 += 1; */
|
ao4 += 1;
|
||||||
b += 4;
|
b += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
b[ 7] = data08;
|
b[ 7] = data08;
|
||||||
|
|
||||||
ao1 += 2 * lda;
|
ao1 += 2 * lda;
|
||||||
// ao2 += 2 * lda;
|
ao2 += 2 * lda;
|
||||||
b += 8;
|
b += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
b[ 2] = data03;
|
b[ 2] = data03;
|
||||||
b[ 3] = data04;
|
b[ 3] = data04;
|
||||||
|
|
||||||
// ao1 += lda;
|
ao1 += lda;
|
||||||
b += 4;
|
b += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
if (i) {
|
if (i) {
|
||||||
|
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
// ao1 += 2;
|
ao1 += 2;
|
||||||
b += 2;
|
b += 2;
|
||||||
} else
|
} else
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
|
@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
b[ 0] = data01;
|
b[ 0] = data01;
|
||||||
b[ 1] = data02;
|
b[ 1] = data02;
|
||||||
|
|
||||||
// ao1 += lda;
|
ao1 += lda;
|
||||||
b += 2;
|
b += 2;
|
||||||
} else {
|
} else {
|
||||||
#ifdef UNIT
|
#ifdef UNIT
|
||||||
|
@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
do {
|
do {
|
||||||
|
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
|
b += 1;
|
||||||
ao1 += 1;
|
ao1 += 1;
|
||||||
} else {
|
} else
|
||||||
#ifdef UNIT
|
|
||||||
if (X > posY) {
|
if (X > posY) {
|
||||||
#endif
|
data01 = *(ao1 + 0);
|
||||||
b[ 0] = *(ao1 + 0);
|
b[ 0] = data01;
|
||||||
#ifdef UNIT
|
ao1 += lda;
|
||||||
|
b += 1;
|
||||||
} else {
|
} else {
|
||||||
|
#ifdef UNIT
|
||||||
b[ 0] = ONE;
|
b[ 0] = ONE;
|
||||||
}
|
#else
|
||||||
|
data01 = *(ao1 + 0);
|
||||||
|
b[ 0] = data01;
|
||||||
#endif
|
#endif
|
||||||
ao1 += lda;
|
ao1 += lda;
|
||||||
}
|
b += 1;
|
||||||
b ++;
|
}
|
||||||
X ++;
|
|
||||||
|
X += 1;
|
||||||
i --;
|
i --;
|
||||||
} while (i > 0);
|
} while (i > 0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
||||||
}
|
}
|
||||||
|
|
||||||
a1 += 2 * lda;
|
a1 += 2 * lda;
|
||||||
// a2 += 2 * lda;
|
a2 += 2 * lda;
|
||||||
b += 8;
|
b += 8;
|
||||||
|
|
||||||
ii += 2;
|
ii += 2;
|
||||||
|
|
|
@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m & 1) {
|
if (m & 1) {
|
||||||
#ifdef UNIT
|
|
||||||
|
if (X > posY) {
|
||||||
|
ao1 += 2;
|
||||||
|
ao2 += 2;
|
||||||
|
b += 4;
|
||||||
|
|
||||||
|
} else
|
||||||
if (X < posY) {
|
if (X < posY) {
|
||||||
#endif
|
data1 = *(ao1 + 0);
|
||||||
b[ 0] = *(ao1 + 0);
|
data2 = *(ao1 + 1);
|
||||||
b[ 1] = *(ao1 + 1);
|
data3 = *(ao1 + 2);
|
||||||
#ifdef UNIT
|
data4 = *(ao1 + 3);
|
||||||
|
|
||||||
|
b[ 0] = data1;
|
||||||
|
b[ 1] = data2;
|
||||||
|
b[ 2] = data3;
|
||||||
|
b[ 3] = data4;
|
||||||
|
|
||||||
|
ao1 += lda;
|
||||||
|
b += 4;
|
||||||
} else {
|
} else {
|
||||||
|
#ifdef UNIT
|
||||||
|
data3 = *(ao1 + 2);
|
||||||
|
data4 = *(ao1 + 3);
|
||||||
|
|
||||||
b[ 0] = ONE;
|
b[ 0] = ONE;
|
||||||
b[ 1] = ZERO;
|
b[ 1] = ZERO;
|
||||||
}
|
b[ 2] = data3;
|
||||||
|
b[ 3] = data4;
|
||||||
|
#else
|
||||||
|
data1 = *(ao1 + 0);
|
||||||
|
data2 = *(ao1 + 1);
|
||||||
|
data3 = *(ao1 + 2);
|
||||||
|
data4 = *(ao1 + 3);
|
||||||
|
|
||||||
|
b[ 0] = data1;
|
||||||
|
b[ 1] = data2;
|
||||||
|
b[ 2] = data3;
|
||||||
|
b[ 3] = data4;
|
||||||
#endif
|
#endif
|
||||||
b += 4;
|
b += 4;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
posY += 2;
|
posY += 2;
|
||||||
|
@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
} while (i > 0);
|
} while (i > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// posY += 1;
|
posY += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
||||||
|
|
||||||
BLASLONG i, ii, j, jj;
|
BLASLONG i, ii, j, jj;
|
||||||
|
|
||||||
FLOAT data01 = 0.0, data02 = 0.0;
|
FLOAT data01, data02;
|
||||||
FLOAT *a1;
|
FLOAT *a1;
|
||||||
|
|
||||||
lda *= 2;
|
lda *= 2;
|
||||||
|
|
|
@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
||||||
|
|
||||||
BLASLONG i, ii, j, jj;
|
BLASLONG i, ii, j, jj;
|
||||||
|
|
||||||
FLOAT data01 = 0.0, data02 = 0.0, data03, data04;
|
FLOAT data01, data02, data03, data04;
|
||||||
FLOAT data05, data06, data07 = 0.0, data08 = 0.0;
|
FLOAT data05, data06, data07, data08;
|
||||||
FLOAT *a1, *a2;
|
FLOAT *a1, *a2;
|
||||||
|
|
||||||
lda *= 2;
|
lda *= 2;
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
include $(KERNELDIR)/KERNEL.P5600
|
|
@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c
|
||||||
else
|
else
|
||||||
SASUMKERNEL = ../mips/asum.c
|
SASUMKERNEL = ../mips/asum.c
|
||||||
DASUMKERNEL = ../mips/asum.c
|
DASUMKERNEL = ../mips/asum.c
|
||||||
CASUMKERNEL = ../mips/asum.c
|
CASUMKERNEL = ../mips/zasum.c
|
||||||
ZASUMKERNEL = ../mips/asum.c
|
ZASUMKERNEL = ../mips/zasum.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef HAVE_MSA
|
ifdef HAVE_MSA
|
||||||
|
@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
||||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||||
v2f64 v_alpha;
|
v2f64 v_alpha;
|
||||||
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0;
|
v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
|
||||||
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||||
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
|
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
|
||||||
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
|
v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
|
||||||
|
|
||||||
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
|
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
|
||||||
|
|
||||||
|
|
|
@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
|
|
||||||
while(i < n)
|
while(i < n)
|
||||||
{
|
{
|
||||||
|
#if defined(DSDOT)
|
||||||
dot += y[iy] * x[ix] ;
|
dot += (double)y[iy] * (double)x[ix] ;
|
||||||
|
#else
|
||||||
|
dot += y[iy] * x[ix];
|
||||||
|
#endif
|
||||||
ix += inc_x ;
|
ix += inc_x ;
|
||||||
iy += inc_y ;
|
iy += inc_y ;
|
||||||
i++ ;
|
i++ ;
|
||||||
|
|
|
@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
||||||
FLOAT *y_org = y;
|
FLOAT *y_org = y;
|
||||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||||
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0;
|
v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0};
|
||||||
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||||
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
|
v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0};
|
||||||
|
|
||||||
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
|
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
|
||||||
|
|
||||||
|
|
|
@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
|
||||||
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
|
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
|
||||||
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
|
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
|
||||||
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
|
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
|
||||||
|
|
||||||
|
CROTKERNEL = ../arm/zrot.c
|
||||||
|
ZROTKERNEL = ../arm/zrot.c
|
||||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input matrices for NaNs */
|
/* Optionally check input matrices for NaNs */
|
||||||
if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_c_nancheck( n, x, incx ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input matrices for NaNs */
|
/* Optionally check input matrices for NaNs */
|
||||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
lapack_int lrv, lcv; /* row, column stride */
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lrv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
lcv = ldv;
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
} else {
|
||||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
lrv = ldv;
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lcv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
}
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||||
|
|
||||||
|
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||||
|
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -13;
|
return -13;
|
||||||
}
|
}
|
||||||
|
@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
||||||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > nrows_v ) {
|
if( k > nrows_v ) {
|
||||||
|
@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k,
|
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k,
|
||||||
&v[(nrows_v-k)*ldv], ldv ) )
|
&v[(nrows_v-k)*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > ncols_v ) {
|
if( k > ncols_v ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
|
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k,
|
||||||
ldv ) )
|
&v[(ncols_v-k)*lcv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
|
|
|
@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
|
||||||
if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) {
|
if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_c_nancheck( n-1, x, incx ) ) {
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
|
||||||
lapack_complex_float tau, lapack_complex_float* c,
|
lapack_complex_float tau, lapack_complex_float* c,
|
||||||
lapack_int ldc, lapack_complex_float* work )
|
lapack_int ldc, lapack_complex_float* work )
|
||||||
{
|
{
|
||||||
|
lapack_int lv;
|
||||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_clarfx", -1 );
|
LAPACKE_xerbla( "LAPACKE_clarfx", -1 );
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m,
|
||||||
if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) {
|
if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) {
|
||||||
return -6;
|
return -6;
|
||||||
}
|
}
|
||||||
if( LAPACKE_c_nancheck( m, v, 1 ) ) {
|
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||||
|
if( LAPACKE_c_nancheck( lv, v, 1 ) ) {
|
||||||
return -5;
|
return -5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x,
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||||
if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_c_nancheck( n, x, incx ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {
|
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {
|
||||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -10;
|
return -10;
|
||||||
}
|
}
|
||||||
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input matrices for NaNs */
|
/* Optionally check input matrices for NaNs */
|
||||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
lapack_int lrv, lcv; /* row, column stride */
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lrv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
lcv = ldv;
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
} else {
|
||||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
lrv = ldv;
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lcv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
}
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||||
|
|
||||||
|
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||||
|
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -13;
|
return -13;
|
||||||
}
|
}
|
||||||
|
@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
|
||||||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > nrows_v ) {
|
if( k > nrows_v ) {
|
||||||
|
@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k,
|
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k,
|
||||||
&v[(nrows_v-k)*ldv], ldv ) )
|
&v[(nrows_v-k)*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > ncols_v ) {
|
if( k > ncols_v ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
|
LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k,
|
||||||
ldv ) )
|
&v[(ncols_v-k)*lcv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
|
|
|
@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
|
||||||
if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) {
|
if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_d_nancheck( n-1, x, incx ) ) {
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
|
||||||
lapack_int n, const double* v, double tau, double* c,
|
lapack_int n, const double* v, double tau, double* c,
|
||||||
lapack_int ldc, double* work )
|
lapack_int ldc, double* work )
|
||||||
{
|
{
|
||||||
|
lapack_int lv;
|
||||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_dlarfx", -1 );
|
LAPACKE_xerbla( "LAPACKE_dlarfx", -1 );
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m,
|
||||||
if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) {
|
if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) {
|
||||||
return -6;
|
return -6;
|
||||||
}
|
}
|
||||||
if( LAPACKE_d_nancheck( m, v, 1 ) ) {
|
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||||
|
if( LAPACKE_d_nancheck( lv, v, 1 ) ) {
|
||||||
return -5;
|
return -5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||||
if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_d_nancheck( n, x, incx ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {
|
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {
|
||||||
|
|
|
@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -10;
|
return -10;
|
||||||
}
|
}
|
||||||
if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input matrices for NaNs */
|
/* Optionally check input matrices for NaNs */
|
||||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
lapack_int lrv, lcv; /* row, column stride */
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lrv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
lcv = ldv;
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
} else {
|
||||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
lrv = ldv;
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lcv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
}
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||||
|
|
||||||
|
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||||
|
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -13;
|
return -13;
|
||||||
}
|
}
|
||||||
|
@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
|
||||||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > nrows_v ) {
|
if( k > nrows_v ) {
|
||||||
|
@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k,
|
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k,
|
||||||
&v[(nrows_v-k)*ldv], ldv ) )
|
&v[(nrows_v-k)*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > ncols_v ) {
|
if( k > ncols_v ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
|
LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k,
|
||||||
ldv ) )
|
&v[(ncols_v-k)*lcv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
|
|
|
@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
|
||||||
if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) {
|
if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_s_nancheck( n-1, x, incx ) ) {
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
|
||||||
lapack_int n, const float* v, float tau, float* c,
|
lapack_int n, const float* v, float tau, float* c,
|
||||||
lapack_int ldc, float* work )
|
lapack_int ldc, float* work )
|
||||||
{
|
{
|
||||||
|
lapack_int lv;
|
||||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_slarfx", -1 );
|
LAPACKE_xerbla( "LAPACKE_slarfx", -1 );
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m,
|
||||||
if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) {
|
if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) {
|
||||||
return -6;
|
return -6;
|
||||||
}
|
}
|
||||||
if( LAPACKE_s_nancheck( m, v, 1 ) ) {
|
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||||
|
if( LAPACKE_s_nancheck( lv, v, 1 ) ) {
|
||||||
return -5;
|
return -5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||||
if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_s_nancheck( n, x, incx ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {
|
if( LAPACKE_s_nancheck( 1, scale, 1 ) ) {
|
||||||
|
|
|
@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -10;
|
return -10;
|
||||||
}
|
}
|
||||||
if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input matrices for NaNs */
|
/* Optionally check input matrices for NaNs */
|
||||||
if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_z_nancheck( n, x, incx ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input matrices for NaNs */
|
/* Optionally check input matrices for NaNs */
|
||||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
lapack_int lrv, lcv; /* row, column stride */
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lrv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
lcv = ldv;
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
} else {
|
||||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
lrv = ldv;
|
||||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
lcv = 1;
|
||||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
}
|
||||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||||
|
|
||||||
|
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||||
|
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||||
|
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -13;
|
return -13;
|
||||||
}
|
}
|
||||||
|
@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
|
||||||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv],
|
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > nrows_v ) {
|
if( k > nrows_v ) {
|
||||||
|
@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k,
|
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k,
|
||||||
&v[(nrows_v-k)*ldv], ldv ) )
|
&v[(nrows_v-k)*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k],
|
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||||
ldv ) )
|
&v[k*lrv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||||
if( k > ncols_v ) {
|
if( k > ncols_v ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
|
LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k],
|
if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k,
|
||||||
ldv ) )
|
&v[(ncols_v-k)*lcv], ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||||
return -9;
|
return -9;
|
||||||
|
|
|
@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
|
||||||
if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) {
|
if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_z_nancheck( n-1, x, incx ) ) {
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
|
||||||
lapack_complex_double tau, lapack_complex_double* c,
|
lapack_complex_double tau, lapack_complex_double* c,
|
||||||
lapack_int ldc, lapack_complex_double* work )
|
lapack_int ldc, lapack_complex_double* work )
|
||||||
{
|
{
|
||||||
|
lapack_int lv;
|
||||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||||
LAPACKE_xerbla( "LAPACKE_zlarfx", -1 );
|
LAPACKE_xerbla( "LAPACKE_zlarfx", -1 );
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m,
|
||||||
if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) {
|
if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) {
|
||||||
return -6;
|
return -6;
|
||||||
}
|
}
|
||||||
if( LAPACKE_z_nancheck( m, v, 1 ) ) {
|
lv = (LAPACKE_lsame( side, 'l' ) ? m : n);
|
||||||
|
if( LAPACKE_z_nancheck( lv, v, 1 ) ) {
|
||||||
return -5;
|
return -5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x,
|
||||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||||
if( LAPACKE_get_nancheck() ) {
|
if( LAPACKE_get_nancheck() ) {
|
||||||
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
/* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */
|
||||||
if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) {
|
if( LAPACKE_z_nancheck( n, x, incx ) ) {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {
|
if( LAPACKE_d_nancheck( 1, scale, 1 ) ) {
|
||||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -10;
|
return -10;
|
||||||
}
|
}
|
||||||
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans,
|
||||||
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||||
return -9;
|
return -9;
|
||||||
}
|
}
|
||||||
if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
|
if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
|
||||||
return -8;
|
return -8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -512,7 +512,7 @@ C END IF
|
||||||
*
|
*
|
||||||
* Call the kernel
|
* Call the kernel
|
||||||
*
|
*
|
||||||
#if defined(_OPENMP) && _OPENMP >= 201307L
|
#if defined(_OPENMP) && _OPENMP >= 201307
|
||||||
IF( TTYPE.NE.1 ) THEN
|
IF( TTYPE.NE.1 ) THEN
|
||||||
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
||||||
!$OMP$ DEPEND(in:WORK(MYID-1))
|
!$OMP$ DEPEND(in:WORK(MYID-1))
|
||||||
|
|
|
@ -481,7 +481,7 @@
|
||||||
*
|
*
|
||||||
* Call the kernel
|
* Call the kernel
|
||||||
*
|
*
|
||||||
#if defined(_OPENMP) && _OPENMP >= 201307L
|
#if defined(_OPENMP) && _OPENMP >= 201307
|
||||||
IF( TTYPE.NE.1 ) THEN
|
IF( TTYPE.NE.1 ) THEN
|
||||||
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
||||||
!$OMP$ DEPEND(in:WORK(MYID-1))
|
!$OMP$ DEPEND(in:WORK(MYID-1))
|
||||||
|
|
|
@ -512,7 +512,7 @@ C END IF
|
||||||
*
|
*
|
||||||
* Call the kernel
|
* Call the kernel
|
||||||
*
|
*
|
||||||
#if defined(_OPENMP) && _OPENMP >= 201307L
|
#if defined(_OPENMP) && _OPENMP >= 201307
|
||||||
|
|
||||||
IF( TTYPE.NE.1 ) THEN
|
IF( TTYPE.NE.1 ) THEN
|
||||||
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
|
||||||
|
|
|
@ -67,6 +67,26 @@ double sqrt(double);
|
||||||
#undef GETRF_FACTOR
|
#undef GETRF_FACTOR
|
||||||
#define GETRF_FACTOR 1.00
|
#define GETRF_FACTOR 1.00
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(USE_PTHREAD_LOCK)
|
||||||
|
static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||||
|
static pthread_spinlock_t getrf_lock = 0;
|
||||||
|
#else
|
||||||
|
static BLASULONG getrf_lock = 0UL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(USE_PTHREAD_LOCK)
|
||||||
|
static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||||
|
static pthread_spinlock_t getrf_flag_lock = 0;
|
||||||
|
#else
|
||||||
|
static BLASULONG getrf_flag_lock = 0UL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
|
static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) {
|
||||||
|
|
||||||
double m = (double)(M - IS - BK);
|
double m = (double)(M - IS - BK);
|
||||||
|
@ -99,7 +119,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
||||||
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
|
FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
|
||||||
FLOAT *sbb = sb;
|
FLOAT *sbb = sb;
|
||||||
|
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
|
||||||
|
#else
|
||||||
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
||||||
|
#endif
|
||||||
|
|
||||||
blasint *ipiv = (blasint *)args -> c;
|
blasint *ipiv = (blasint *)args -> c;
|
||||||
|
|
||||||
|
@ -177,7 +201,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
|
||||||
/* Non blocking implementation */
|
/* Non blocking implementation */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
|
@ -216,9 +245,12 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
FLOAT *sbb= sb;
|
FLOAT *sbb= sb;
|
||||||
|
|
||||||
blasint *ipiv = (blasint *)args -> c;
|
blasint *ipiv = (blasint *)args -> c;
|
||||||
|
BLASLONG jw;
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d;
|
||||||
|
#else
|
||||||
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;
|
||||||
|
#endif
|
||||||
if (args -> a == NULL) {
|
if (args -> a == NULL) {
|
||||||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
|
||||||
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
|
||||||
|
@ -245,8 +277,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
|
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
|
||||||
|
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
|
#if 1
|
||||||
|
{
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
|
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
do {
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
|
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
} while (jw);
|
||||||
|
}
|
||||||
|
#else
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
|
||||||
|
#endif
|
||||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
@ -283,18 +327,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
b + (is + jjs * lda) * COMPSIZE, lda, is);
|
b + (is + jjs * lda) * COMPSIZE, lda, is);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MB;
|
MB;
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOCK_COMMAND(&getrf_flag_lock);
|
||||||
flag[mypos * CACHE_LINE_SIZE] = 0;
|
flag[mypos * CACHE_LINE_SIZE] = 0;
|
||||||
|
UNLOCK_COMMAND(&getrf_flag_lock);
|
||||||
|
|
||||||
if (m == 0) {
|
if (m == 0) {
|
||||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
|
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -318,7 +367,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
|
||||||
|
|
||||||
if ((current != mypos) && (!is)) {
|
if ((current != mypos) && (!is)) {
|
||||||
|
#if 1
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
|
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
do {
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
|
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
} while (jw == 0);
|
||||||
|
#else
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
|
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
|
||||||
|
@ -327,7 +387,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
|
|
||||||
MB;
|
MB;
|
||||||
if (is + min_i >= m) {
|
if (is + min_i >= m) {
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -339,7 +401,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
|
||||||
|
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||||
|
#if 1
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
|
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
do {
|
||||||
|
LOCK_COMMAND(&getrf_lock);
|
||||||
|
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
|
||||||
|
UNLOCK_COMMAND(&getrf_lock);
|
||||||
|
} while(jw != 0);
|
||||||
|
#else
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -374,11 +447,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
BLASLONG i, j, k, is, bk;
|
BLASLONG i, j, k, is, bk;
|
||||||
|
|
||||||
BLASLONG num_cpu;
|
BLASLONG num_cpu;
|
||||||
|
BLASLONG f;
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
|
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE];
|
||||||
#else
|
#else
|
||||||
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
|
@ -501,11 +580,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
if (mm >= nn) {
|
if (mm >= nn) {
|
||||||
|
|
||||||
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||||
|
if (width == 0) width = nn;
|
||||||
if (nn < width) width = nn;
|
if (nn < width) width = nn;
|
||||||
nn -= width;
|
nn -= width;
|
||||||
range_N[num_cpu + 1] = range_N[num_cpu] + width;
|
range_N[num_cpu + 1] = range_N[num_cpu] + width;
|
||||||
|
|
||||||
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||||
|
if (width == 0) width = mm;
|
||||||
if (mm < width) width = mm;
|
if (mm < width) width = mm;
|
||||||
if (nn <= 0) width = mm;
|
if (nn <= 0) width = mm;
|
||||||
mm -= width;
|
mm -= width;
|
||||||
|
@ -514,11 +595,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||||
|
if (width == 0) width = mm;
|
||||||
if (mm < width) width = mm;
|
if (mm < width) width = mm;
|
||||||
mm -= width;
|
mm -= width;
|
||||||
range_M[num_cpu + 1] = range_M[num_cpu] + width;
|
range_M[num_cpu + 1] = range_M[num_cpu] + width;
|
||||||
|
|
||||||
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
|
||||||
|
if (width == 0) width = nn;
|
||||||
if (nn < width) width = nn;
|
if (nn < width) width = nn;
|
||||||
if (mm <= 0) width = nn;
|
if (mm <= 0) width = nn;
|
||||||
nn -= width;
|
nn -= width;
|
||||||
|
@ -561,7 +644,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
range_n_new[1] = offset + is + bk;
|
range_n_new[1] = offset + is + bk;
|
||||||
|
|
||||||
if (num_cpu > 0) {
|
if (num_cpu > 0) {
|
||||||
|
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
exec_blas_async(0, &queue[0]);
|
exec_blas_async(0, &queue[0]);
|
||||||
|
@ -572,8 +654,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
|
|
||||||
if (iinfo && !info) info = iinfo + is;
|
if (iinfo && !info) info = iinfo + is;
|
||||||
|
|
||||||
for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
|
for (i = 0; i < num_cpu; i ++) {
|
||||||
|
#if 1
|
||||||
|
LOCK_COMMAND(&getrf_flag_lock);
|
||||||
|
f=flag[i*CACHE_LINE_SIZE];
|
||||||
|
UNLOCK_COMMAND(&getrf_flag_lock);
|
||||||
|
while (f!=0) {
|
||||||
|
LOCK_COMMAND(&getrf_flag_lock);
|
||||||
|
f=flag[i*CACHE_LINE_SIZE];
|
||||||
|
UNLOCK_COMMAND(&getrf_flag_lock);
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
while (flag[i*CACHE_LINE_SIZE]) {};
|
||||||
|
#endif
|
||||||
|
}
|
||||||
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
|
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
@ -634,8 +728,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
BLASLONG range[MAX_CPU_NUMBER + 1];
|
BLASLONG range[MAX_CPU_NUMBER + 1];
|
||||||
|
|
||||||
BLASLONG width, nn, num_cpu;
|
BLASLONG width, nn, num_cpu;
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
|
|
@ -0,0 +1,664 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||||
|
|
||||||
|
//The array of job_t may overflow the stack.
|
||||||
|
//Instead, use malloc to alloc job_t.
|
||||||
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
#define USE_ALLOC_HEAP
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
static FLOAT dm1 = -1.;
|
||||||
|
|
||||||
|
#ifndef KERNEL_FUNC
|
||||||
|
#ifndef LOWER
|
||||||
|
#define KERNEL_FUNC SYRK_KERNEL_U
|
||||||
|
#else
|
||||||
|
#define KERNEL_FUNC SYRK_KERNEL_L
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef LOWER
|
||||||
|
#ifndef COMPLEX
|
||||||
|
#define TRSM_KERNEL TRSM_KERNEL_LT
|
||||||
|
#else
|
||||||
|
#define TRSM_KERNEL TRSM_KERNEL_LC
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#ifndef COMPLEX
|
||||||
|
#define TRSM_KERNEL TRSM_KERNEL_RN
|
||||||
|
#else
|
||||||
|
#define TRSM_KERNEL TRSM_KERNEL_RR
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef CACHE_LINE_SIZE
|
||||||
|
#define CACHE_LINE_SIZE 8
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef DIVIDE_RATE
|
||||||
|
#define DIVIDE_RATE 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef SWITCH_RATIO
|
||||||
|
#define SWITCH_RATIO 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef LOWER
|
||||||
|
#define TRANS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef SYRK_LOCAL
|
||||||
|
#if !defined(LOWER) && !defined(TRANS)
|
||||||
|
#define SYRK_LOCAL SYRK_UN
|
||||||
|
#elif !defined(LOWER) && defined(TRANS)
|
||||||
|
#define SYRK_LOCAL SYRK_UT
|
||||||
|
#elif defined(LOWER) && !defined(TRANS)
|
||||||
|
#define SYRK_LOCAL SYRK_LN
|
||||||
|
#else
|
||||||
|
#define SYRK_LOCAL SYRK_LT
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * KERNEL_OPERATION: run KERNEL_FUNC on the block of C whose first element is
 * at row X, column Y (leading dimension LDC).  The last argument, X - Y, is
 * the offset of that block relative to the diagonal.  The complex build
 * passes both components of ALPHA.
 */
#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA)[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, LDC, (X) - (Y))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, LDC, (X) - (Y))
#endif
#endif

/*
 * ICOPY_OPERATION / OCOPY_OPERATION: pack an M x N block of A into BUFFER.
 * With TRANS the block starts at (X, Y); without it at (Y, X).
 * The definitions deliberately do not end in ';' so that an invocation
 * followed by ';' is exactly one statement.
 */
#ifndef ICOPY_OPERATION
#ifndef TRANS
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif

#ifndef OCOPY_OPERATION
#ifdef TRANS
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif

/*
 * Short names for the blas_arg_t fields read by inner_thread().  They expand
 * in terms of a local variable called `args`.  Parenthesised so they behave
 * as a single operand in any expression.
 */
#ifndef S
#define S   (args -> a)    /* triangular block packed by TRSM_*COPY */
#endif
#ifndef A
#define A   (args -> b)    /* panel that is solved and then used for the update */
#endif
#ifndef C
#define C   (args -> c)    /* matrix updated by KERNEL_OPERATION */
#endif
#ifndef LDA
#define LDA (args -> lda)
#endif
#ifndef N
#define N   (args -> m)    /* note: maps to the m field */
#endif
#ifndef K
#define K   (args -> k)
#endif
|
||||||
|
|
||||||
|
/*
 * Worker routine executed by every thread queued in thread_driver().
 *
 *   args     shared arguments; S (args->a) is the k x k triangular block,
 *            A (args->b) the panel to solve, C (args->c) the matrix to update,
 *            args->common the job_t hand-shake table.
 *   range_m  unused.
 *   range_n  partition boundaries; this thread owns
 *            [range_n[mypos], range_n[mypos + 1]).
 *   sa       packing buffer for this thread's own row blocks.
 *   sb       packing buffer: the first k*k*COMPSIZE elements receive the packed
 *            triangle, the aligned remainder is split into DIVIDE_RATE panels.
 *   mypos    index of this thread in range_n[] and job[].
 *
 * Phases:
 *   1. pack the triangle, then copy + TRSM_KERNEL each sub-panel of the owned
 *      range and publish the sub-panel address in job[mypos].working[];
 *   2. run KERNEL_OPERATION for the owned rows against every sub-panel
 *      published by threads mypos..nthreads-1 (upper) or mypos..0 (lower),
 *      spinning until a foreign sub-panel is published;
 *   3. spin until every other thread has cleared the slots this thread
 *      published, so the buffers may be reused.
 *
 * Always returns 0.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda;
  BLASLONG m_from, m_to;

  FLOAT *alpha;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;

  alpha = (FLOAT *)args -> alpha;

  m_from = range_n[mypos + 0];
  m_to   = range_n[mypos + 1];

#if 0
  fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
#endif

  /* Width of one sub-panel: the owned range split DIVIDE_RATE ways, rounded
     up to a multiple of GEMM_UNROLL_MN. */
  div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

  /* Sub-panel buffers start right after the k x k packed triangle in sb. */
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }

  /* Every thread packs its own private copy of the triangular block. */
#ifndef LOWER
  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif

  /* Phase 1: solve the owned part of the panel, one sub-panel at a time. */
  for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {

    for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

      min_jj = MIN(m_to, xxx + div_n) - jjs;

#ifndef LOWER
      if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
      if (min_jj > GEMM_P) min_jj = GEMM_P;
#endif

#ifndef LOWER
      OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL (k, min_jj, k, dm1,
#ifdef COMPLEX
                   ZERO,
#endif
                   sb,
                   buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
                   a + jjs * lda * COMPSIZE, lda, 0);
#else
      ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL (min_jj, k, k, dm1,
#ifdef COMPLEX
                   ZERO,
#endif
                   buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
                   sb,
                   a + jjs * COMPSIZE, lda, 0);
#endif
    }

    /* The contents of buffer[bufferside] must be visible to other CPUs
       before its address is: order the panel writes ahead of the publishing
       stores below.  Without this barrier the volatile (non-_Atomic) build
       of job_t gives a consumer no guarantee that a non-zero slot implies a
       fully written panel. */
    WMB;

    /* Publish the sub-panel to every thread that will consume it. */
#ifndef LOWER
    for (i = 0; i <= mypos; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
    for (i = mypos; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif

    WMB;
  }

  /* Phase 2a: first row block of the owned range.  Row blocks are GEMM_P
     wide; a remainder between GEMM_P and 2*GEMM_P is split in two halves
     rounded up to GEMM_UNROLL_MN. */
  min_i = m_to - m_from;

  if (min_i >= GEMM_P * 2) {
    min_i = GEMM_P;
  } else
    if (min_i > GEMM_P) {
      min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    }

#ifndef LOWER
  ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
  OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif

  current = mypos;

#ifndef LOWER
  while (current < args -> nthreads)
#else
  while (current >= 0)
#endif
    {
      div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        /* thread has to wait */
        /* NOTE(review): in the volatile build a read barrier after this spin
           looks necessary on weakly ordered CPUs; no full-barrier macro is
           visible in this file, so none is added here -- confirm. */
        if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

        KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                         sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                         c, lda, m_from, xxx);

        /* Only one row block in total: this was the last use of the panel,
           so hand it back to its owner by clearing the slot. */
        if (m_from + min_i >= m_to) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          WMB;
        }
      }

#ifndef LOWER
      current ++;
#else
      current --;
#endif
    }

  /* Phase 2b: remaining row blocks.  All needed panels were already waited
     for in phase 2a, so no spinning is required here. */
  for(is = m_from + min_i; is < m_to; is += min_i){
    min_i = m_to - is;

    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else
      if (min_i > GEMM_P) {
        min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
      }

#ifndef LOWER
    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
    OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif

    current = mypos;

#ifndef LOWER
    while (current < args -> nthreads)
#else
    while (current >= 0)
#endif
      {
        div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, lda, is, xxx);

          /* Last row block: release the panel. */
          if (is + min_i >= m_to) {
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
            WMB;
          }
        }
#ifndef LOWER
        current ++;
#else
        current --;
#endif
      }
  }

  /* Phase 3: do not return (and let the buffers be reused) until every other
     thread has released the panels this thread published. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
        while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  return 0;
}
|
||||||
|
|
||||||
|
/*
 * Split the args->m columns among at most args->nthreads workers and run
 * inner_thread() on each of them through exec_blas().
 *
 * Chunk widths come from sqrt(i*i + n*n/nthreads) - i, rounded up to a
 * multiple of (mask + 1), where mask + 1 is the larger GEMM unroll factor of
 * the current precision.  The upper-triangle build fills range[] backwards
 * from range[MAX_CPU_NUMBER]; the lower-triangle build fills it forwards from
 * range[0].
 *
 * sa and sb are handed to the first queue entry only; the other entries get
 * NULL.  Always returns 0; with USE_ALLOC_HEAP a failed malloc() terminates
 * the process.
 */
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){

  blas_arg_t   newarg;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG     range[MAX_CPU_NUMBER + 100];
  BLASLONG    *range_base;

#ifndef USE_ALLOC_HEAP
  job_t        job[MAX_CPU_NUMBER];
#else
  job_t       *job = NULL;
#endif

  BLASLONG nthreads = args -> nthreads;
  BLASLONG n        = args -> m;
  BLASLONG num_cpu  = 0;
  BLASLONG start, width;
  BLASLONG owner, peer, side;
  int    mode, mask;
  double dnum;

  /* Queue mode flags and unroll mask for the precision being compiled. */
#if   !defined(COMPLEX) && defined(XDOUBLE)
  mode = BLAS_XDOUBLE | BLAS_REAL;
  mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif !defined(COMPLEX) && defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_REAL;
  mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#elif !defined(COMPLEX)
  mode = BLAS_SINGLE  | BLAS_REAL;
  mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#elif defined(XDOUBLE)
  mode = BLAS_XDOUBLE | BLAS_COMPLEX;
  mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_COMPLEX;
  mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode = BLAS_SINGLE  | BLAS_COMPLEX;
  mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if (job == NULL) {
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* Arguments shared by all workers. */
  newarg.m      = args -> m;
  newarg.k      = args -> k;
  newarg.a      = args -> a;
  newarg.b      = args -> b;
  newarg.c      = args -> c;
  newarg.lda    = args -> lda;
  newarg.alpha  = args -> alpha;
  newarg.common = (void *)job;

#ifndef LOWER
  range[MAX_CPU_NUMBER] = n;
#endif
  range[0] = 0;

  dnum = (double)n * (double)n /(double)nthreads;

  /* One iteration per worker; `start` counts the columns assigned so far. */
  for (start = 0; start < n; start += width, num_cpu ++) {

    if (nthreads - num_cpu > 1) {
      double di = (double)start;

      width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));

#ifndef LOWER
      /* First chunk only: re-derive the width from the rounded-down
         remainder. */
      if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));
#endif

      if ((width > n - start) || (width < mask)) width = n - start;

    } else {
      /* The last available worker takes everything that is left. */
      width = n - start;
    }

#ifndef LOWER
    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;
#else
    range[num_cpu + 1] = range[num_cpu] + width;
#endif

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
  }

  /* All workers index the same boundary table by their position. */
#ifndef LOWER
  range_base = &range[MAX_CPU_NUMBER - num_cpu];
#else
  range_base = range;
#endif
  for (owner = 0; owner < num_cpu; owner ++) queue[owner].range_n = range_base;

  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* Nothing is published yet. */
    for (owner = 0; owner < num_cpu; owner ++) {
      for (peer = 0; peer < num_cpu; peer ++) {
        for (side = 0; side < DIVIDE_RATE; side ++) {
          job[owner].working[peer][CACHE_LINE_SIZE * side] = 0;
        }
      }
    }

    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
 * Multi-threaded, recursively blocked POTRF driver.
 *
 * Falls back to POTRF_U_SINGLE / POTRF_L_SINGLE when only one thread is
 * available or when the matrix order is at most 2 * GEMM_UNROLL_N.
 * Otherwise it walks the diagonal in steps of `blocking` and, for each
 * diagonal block of order bk,
 *   1. factorises the block by calling itself recursively, and
 *   2. if columns remain, processes the panel next to the block and updates
 *      the trailing matrix -- through thread_driver(), or, when
 *      USE_SIMPLE_THREADED_LEVEL3 is defined, through gemm_thread_n/m with a
 *      TRSM routine followed by syrk_thread with a HERK routine.
 *
 * range_m and myid are unused.  If range_n is given, only its length
 * range_n[1] - range_n[0] replaces args->n.
 *
 * Returns 0 on success; a non-zero info from a diagonal block is returned
 * offset by the position i of that block.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG   n, lda;
  BLASLONG   i, bk, rest, blocking;
  BLASLONG   info;
  int        mode;
  blas_arg_t newarg;
  FLOAT     *a;
  FLOAT     *diag, *panel, *trail;
  FLOAT      alpha[2] = { -ONE, ZERO};

#if   !defined(COMPLEX) && defined(XDOUBLE)
  mode = BLAS_XDOUBLE | BLAS_REAL;
#elif !defined(COMPLEX) && defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_REAL;
#elif !defined(COMPLEX)
  mode = BLAS_SINGLE  | BLAS_REAL;
#elif defined(XDOUBLE)
  mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode = BLAS_SINGLE  | BLAS_COMPLEX;
#endif

  /* Single thread: use the serial factorisation directly. */
  if (args -> nthreads == 1) {
#ifndef LOWER
    info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
#else
    info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
#endif
    return info;
  }

  a   = (FLOAT *)args -> a;
  lda = args -> lda;
  n   = range_n ? range_n[1] - range_n[0] : args -> n;

  /* Too small to be worth splitting. */
  if (n <= GEMM_UNROLL_N * 2) {
#ifndef LOWER
    info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
#else
    info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
#endif
    return info;
  }

  newarg.lda      = lda;
  newarg.ldb      = lda;
  newarg.ldc      = lda;
  newarg.alpha    = alpha;
  newarg.beta     = NULL;
  newarg.nthreads = args -> nthreads;

  /* Half the order rounded up to GEMM_UNROLL_N, capped at GEMM_Q. */
  blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
  blocking = MIN(blocking, GEMM_Q);

  for (i = 0; i < n; i += blocking) {

    bk   = MIN(n - i, blocking);
    rest = n - i - bk;

    diag  = a + (i + i * lda) * COMPSIZE;                  /* block (i, i)       */
    trail = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;    /* block (i+bk, i+bk) */
#ifndef LOWER
    panel = a + (i + (i + bk) * lda) * COMPSIZE;           /* block (i, i+bk)    */
#else
    panel = a + ((i + bk) + i * lda) * COMPSIZE;           /* block (i+bk, i)    */
#endif

    /* Factorise the diagonal block. */
    newarg.m = bk;
    newarg.n = bk;
    newarg.a = diag;

    info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
    if (info) return info + i;

    if (rest <= 0) continue;

#ifndef USE_SIMPLE_THREADED_LEVEL3

    /* Panel solve and trailing update in one threaded pass; newarg.a still
       points at the diagonal block. */
    newarg.m = rest;
    newarg.k = bk;
    newarg.b = panel;
    newarg.c = trail;

    thread_driver(&newarg, sa, sb);

#elif !defined(LOWER)

    newarg.m = bk;
    newarg.n = rest;
    newarg.a = diag;
    newarg.b = panel;

    gemm_thread_n(mode | BLAS_TRANSA_T,
                  &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);

    newarg.n = rest;
    newarg.k = bk;
    newarg.a = panel;
    newarg.c = trail;

    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
                &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);

#else

    newarg.m = rest;
    newarg.n = bk;
    newarg.a = diag;
    newarg.b = panel;

    gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
                  &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);

    newarg.n = rest;
    newarg.k = bk;
    newarg.a = panel;
    newarg.c = trail;

    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
                &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);

#endif
  }

  return 0;
}
|
2
param.h
2
param.h
|
@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define SYMV_P 16
|
#define SYMV_P 16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500)
|
#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500)
|
||||||
#define SNUMOPT 2
|
#define SNUMOPT 2
|
||||||
#define DNUMOPT 2
|
#define DNUMOPT 2
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ endif ()
|
||||||
|
|
||||||
# known to hang with the native Windows and Android threads
|
# known to hang with the native Windows and Android threads
|
||||||
# FIXME needs checking if this works on any of the other platforms
|
# FIXME needs checking if this works on any of the other platforms
|
||||||
|
if (NOT NO_CBLAS)
|
||||||
if (NOT USE_OPENMP)
|
if (NOT USE_OPENMP)
|
||||||
if (OS_CYGWIN_NT OR OS_LINUX)
|
if (OS_CYGWIN_NT OR OS_LINUX)
|
||||||
set(OpenBLAS_utest_src
|
set(OpenBLAS_utest_src
|
||||||
|
@ -33,6 +34,7 @@ set(OpenBLAS_utest_src
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT NO_LAPACK)
|
if (NOT NO_LAPACK)
|
||||||
set(OpenBLAS_utest_src
|
set(OpenBLAS_utest_src
|
||||||
|
|
|
@ -17,11 +17,13 @@ endif
|
||||||
|
|
||||||
#this does not work with OpenMP nor with native Windows or Android threads
|
#this does not work with OpenMP nor with native Windows or Android threads
|
||||||
# FIXME TBD if this works on OSX, SunOS, POWER and zarch
|
# FIXME TBD if this works on OSX, SunOS, POWER and zarch
|
||||||
|
ifneq ($(NO_CBLAS), 1)
|
||||||
ifndef USE_OPENMP
|
ifndef USE_OPENMP
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT))
|
||||||
OBJS += test_fork.o
|
OBJS += test_fork.o
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
all : run_test
|
all : run_test
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
**********************************************************************************/
|
**********************************************************************************/
|
||||||
|
|
||||||
#include "openblas_utest.h"
|
#include "openblas_utest.h"
|
||||||
|
#include <sys/types.h>
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
#include <cblas.h>
|
#include <cblas.h>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue