diff --git a/.travis.yml b/.travis.yml index 0b280c2fc..4a25e7121 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ language: c jobs: include: - &test-ubuntu + os: linux stage: test compiler: gcc addons: @@ -57,7 +58,8 @@ jobs: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" - - stage: test + - os: linux + stage: test compiler: gcc addons: apt: @@ -77,6 +79,7 @@ jobs: # which is slower than container-based infrastructure used for jobs # that don't require sudo. - &test-alpine + os: linux stage: test dist: trusty sudo: true @@ -120,6 +123,7 @@ jobs: - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - &test-cmake + os: linux stage: test compiler: clang addons: @@ -147,6 +151,23 @@ jobs: env: - CMAKE=1 + - &test-macos + os: osx + stage: test + osx_image: xcode8 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - brew update + - brew install gcc # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="BINARY=64 INTERFACE64=1" + + - <<: *test-macos + env: + - BTYPE="BINARY=32" + # whitelist branches: only: diff --git a/Makefile b/Makefile index 5198f9e2b..c0e5fbcf8 100644 --- a/Makefile +++ b/Makefile @@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), FreeBSD) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), NetBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index 81d097215..21c3c9e22 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), FreeBSD) - @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" - @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), NetBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -101,8 +96,9 @@ endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @@ -115,7 +111,7 @@ endif ifndef NO_SHARED #ifeq logical or -ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) diff --git a/Makefile.prebuild b/Makefile.prebuild index daa556f65..a366004a1 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -17,6 +17,10 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif +ifeq ($(TARGET), 1004K) +TARGET_FLAGS = -mips32r2 +endif + ifeq ($(TARGET), P5600) TARGET_FLAGS = -mips32r5 endif diff --git a/Makefile.rule b/Makefile.rule index 62bf63df4..12734464b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,13 @@ VERSION = 0.3.0.dev # automatically detected by the the script. # NUM_THREADS = 24 +# If you have enabled USE_OPENMP and your application would call +# OpenBLAS's calculation API from multi threads, please comment it in. +# This flag defines how many instances of OpenBLAS's calculation API can +# actually run in parallel. If more threads call OpenBLAS's calculation API, +# they need to wait for the preceding API calls to finish or risk data corruption. +# NUM_PARALLEL = 2 + # if you don't need to install the static library, please comment it in. # NO_STATIC = 1 diff --git a/Makefile.system b/Makefile.system index 9720b317f..7bfac1fa8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # http://stackoverflow.com/questions/4029274/mingw-and-make-variables # - Default value is 'cc' which is not always a valid command (e.g. MinGW). ifeq ($(origin CC),default) + +# Check if $(CC) refers to a valid command and set the value to gcc if not +ifneq ($(findstring cmd.exe,$(SHELL)),) +ifeq ($(shell where $(CC) 2>NUL),) CC = gcc -# Change the default compile to clang on Mac OSX. -# http://stackoverflow.com/questions/714100/os-detecting-makefile -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Darwin) - CC = clang -# EXTRALIB += -Wl,-no_compact_unwind -endif endif +else # POSIX-ish +ifeq ($(shell command -v $(CC) 2>/dev/null),) +ifeq ($(shell uname -s),Darwin) +CC = clang +# EXTRALIB += -Wl,-no_compact_unwind +else +CC = gcc +endif # Darwin +endif # CC exists +endif # Shell is sane + +endif # CC is set to default # Default Fortran compiler (FC) is selected by f_check. @@ -175,6 +184,10 @@ endif endif +ifndef NUM_PARALLEL +NUM_PARALLEL = 1 +endif + ifndef NUM_THREADS NUM_THREADS = $(NUM_CORES) endif @@ -230,7 +243,7 @@ endif MD5SUM = md5 -r endif -ifeq ($(OSNAME), FreeBSD) +ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) MD5SUM = md5 -r endif @@ -424,7 +437,7 @@ CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), INTEL) -CCOMMON_OPT += -openmp +CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), PGI) @@ -555,9 +568,14 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif +ifeq ($(CORE), 1004K) +CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +endif + ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) @@ -704,7 +722,7 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp +FCOMMON_OPT += -fopenmp endif endif @@ -952,6 +970,8 @@ endif CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) +CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) + ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif diff --git a/README.md b/README.md index ec32c1f60..02d087334 100644 --- a/README.md +++ b/README.md @@ -5,175 +5,219 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) + ## Introduction + OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documents on OpenBLAS wiki pages . +Please read the documentation on the OpenBLAS wiki pages: . ## Binary Packages -We provide binary packages for the following platform. + +We provide official binary packages for the following platform: * Windows x86/x86_64 You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). ## Installation from Source -Download from project homepage. http://xianyi.github.com/OpenBLAS/ -Or, check out codes from git://github.com/xianyi/OpenBLAS.git +Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +using Git from https://github.com/xianyi/OpenBLAS.git. + +### Dependencies + +Building OpenBLAS requires the following to be installed: + +* GNU Make +* A C compiler, e.g. GCC or Clang +* A Fortran compiler (optional, for LAPACK) +* IBM MASS (optional, see below) + ### Normal compile - * type "make" to detect the CPU automatically. - or - * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. + +Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. +To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. +The full target list is in the file `TargetList.txt`. ### Cross compile -Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. + +Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. +The target must be specified explicitly when cross compiling. Examples: -On X86 box, compile this library for loongson3a CPU. +* On an x86 box, compile this library for a loongson3a CPU: + ```sh + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + ``` - make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A - -On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. - - make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 +* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: + ```sh + make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 + ``` ### Debug version - make DEBUG=1 +A debug version can be built using `make DEBUG=1`. -### Compile with MASS Support on Power CPU (Optional dependency) +### Compile with MASS support on Power CPU (optional) -[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and -Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. -The library can be installed as below - +The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library +consists of a set of mathematical functions for C, C++, and Fortran applications that are +are tuned for optimum performance on POWER architectures. +OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. +The library can be installed as shown: - * On Ubuntu: +* On Ubuntu: + ```sh + wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - + echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list + sudo apt-get update + sudo apt-get install libxlmass-devel.8.1.5 + ``` - wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
- echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
- sudo apt-get update
- sudo apt-get install libxlmass-devel.8.1.5
+* On RHEL/CentOS: + ```sh + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key + sudo rpm --import repomd.xml.key + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo + sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ + sudo yum install libxlmass-devel.8.1.5 + ``` - * On RHEL/CentOS: +After installing the MASS library, compile OpenBLAS with `USE_MASS=1`. +For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`. - wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
- sudo rpm --import repomd.xml.key
- wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
- sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
- sudo yum install libxlmass-devel.8.1.5
+### Install to a specific directory (optional) -After installing MASS library, compile openblas with USE_MASS=1. +Use `PREFIX=` when invoking `make`, for example -Example: +```sh +make install PREFIX=your_installation_directory +``` -Compiling on Power8 with MASS support - +The default installation directory is `/opt/OpenBLAS`. - make USE_MASS=1 TARGET=POWER8 +## Supported CPUs and Operating Systems -### Install to the directory (optional) +Please read `GotoBLAS_01Readme.txt`. -Example: +### Additional supported CPUs - make install PREFIX=your_installation_directory +#### x86/x86-64 -The default directory is /opt/OpenBLAS - -## Support CPU & OS -Please read GotoBLAS_01Readme.txt - -### Additional support CPU: - -#### x86/x86-64: - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. -- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) +- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. -#### MIPS64: +#### MIPS64 + - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3B**: Experimental -#### ARM: -- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ ) -- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 ) +#### ARM -#### ARM64: -- **ARMV8**: Experimental +- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+) +- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15) + +#### ARM64 + +- **ARMv8**: Experimental - **ARM Cortex-A57**: Experimental #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1 -#### IBM zEnterprise System: +- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` + +#### IBM zEnterprise System + - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - -### Support OS: +### Supported OS + - **GNU/Linux** -- **MingWin or Visual Studio(CMake)/Windows**: Please read . -- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. -- **FreeBSD**: Supported by community. We didn't test the library on this OS. -- **Android**: Supported by community. Please read . +- **MinGW or Visual Studio (CMake)/Windows**: Please read . +- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. +- **FreeBSD**: Supported by the community. We don't actively test the library on this OS. +- **OpenBSD**: Supported by the community. We don't actively test the library on this OS. +- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. +- **Android**: Supported by the community. Please read . -## Usages -Link with libopenblas.a or -lopenblas for shared library. +## Usage -### Set the number of threads with environment variables. +Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was +compiled as a shared library. -Examples: +### Setting the number of threads using environment variables - export OPENBLAS_NUM_THREADS=4 +Environment variables are used to specify a maximum number of threads. +For example, - or +```sh +export OPENBLAS_NUM_THREADS=4 +export GOTO_NUM_THREADS=4 +export OMP_NUM_THREADS=4 +``` - export GOTO_NUM_THREADS=4 +The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`. - or +If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS` +environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when +compiled with `USE_OPENMP=1`. - export OMP_NUM_THREADS=4 +### Setting the number of threads at runtime -The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. +We provide the following functions to control the number of threads at runtime: -If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. +```c +void goto_set_num_threads(int num_threads); +void openblas_set_num_threads(int num_threads); +``` -### Set the number of threads on runtime. +If you compile this library with `USE_OPENMP=1`, you should use the above functions too. -We provided the below functions to control the number of threads on runtime. +## Reporting bugs - void goto_set_num_threads(int num_threads); - - void openblas_set_num_threads(int num_threads); - -If you compile this lib with USE_OPENMP=1, you should use the above functions, too. - -## Report Bugs -Please add a issue in https://github.com/xianyi/OpenBLAS/issues +Please submit an issue in https://github.com/xianyi/OpenBLAS/issues. ## Contact + * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev -## ChangeLog -Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. +## Change log + +Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version. ## Troubleshooting -* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. -* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. -* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. -* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. -* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. + +* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. +* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. + Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), + there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build + the library with `BIGNUMA=1`. +* OpenBLAS does not set processor affinity by default. + On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in + Makefile.rule. However, note that this may cause + [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). +* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`). + However, it will be okay when you run the same test case on the shell. ## Contributing -1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. -1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. -1. Write a test which shows that the bug was fixed or that the feature works as expected. -1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. + +1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue + to start a discussion around a feature idea or a bug. +2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. +3. Write a test which shows that the bug was fixed or that the feature works as expected. +4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. ## Donation + Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). diff --git a/TargetList.txt b/TargetList.txt index d40545cf8..aeeaa9ede 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -56,6 +56,7 @@ CELL 3.MIPS CPU: P5600 +1004K 4.MIPS64 CPU: SICORTEX diff --git a/USAGE.md b/USAGE.md index c76ceb324..89f3bba67 100644 --- a/USAGE.md +++ b/USAGE.md @@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set `MAX_CPU_NUMBER=NUM_THREADS`. +Despite its name, and due to the use of memory buffers in functions like SGEMM, +the setting of NUM_THREADS can be relevant even for a single-threaded build +of OpenBLAS, if such functions get called by multiple threads of a program +that uses OpenBLAS. In some cases, the affected code may simply crash or throw +a segmentation fault without displaying the above warning first. + +Note that the number of threads used at runtime can be altered to differ from the +value NUM_THREADS was set to at build time. At runtime, the actual number of +threads can be set anywhere from 1 to the build's NUM_THREADS (note however, +that this does not change the number of memory buffers that will be allocated, +which is set at build time). The number of threads for a process can be set by +using the mechanisms described below. + + #### How can I use OpenBLAS in multi-threaded applications? If your application is already multi-threaded, it will conflict with OpenBLAS diff --git a/c_check b/c_check index 20da288be..a3b337602 100644 --- a/c_check +++ b/c_check @@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); $os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/); +$os = OpenBSD if ($data =~ /OS_OPENBSD/); +$os = DragonFly if ($data =~ /OS_DRAGONFLY/); $os = Darwin if ($data =~ /OS_DARWIN/); $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 113ba8526..35973b09b 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,6 +1,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ diff --git a/cmake/system.cmake b/cmake/system.cmake index 3fdd9390c..645895671 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() +if (NOT DEFINED NUM_PARALLEL) + set(NUM_PARALLEL 1) +endif() + if (NOT DEFINED NUM_THREADS) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) # HT? @@ -224,6 +228,8 @@ endif () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () diff --git a/common.h b/common.h index ae98279ef..123e3dee7 100644 --- a/common.h +++ b/common.h @@ -93,7 +93,7 @@ extern "C" { #include #endif -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) #include #endif @@ -179,7 +179,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ @@ -649,6 +649,12 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif +#if (__STDC_VERSION__ >= 201112L) +#ifndef _Atomic +#define _Atomic volatile +#endif +#include +#endif #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); diff --git a/common_x86.h b/common_x86.h index 4363fb2f4..4f538c948 100644 --- a/common_x86.h +++ b/common_x86.h @@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ result = x/y; return result; #else - +#if (MAX_CPU_NUMBER > 64) + if ( y > 64) { + result = x/y; + return result; + } +#endif + y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); @@ -327,7 +333,7 @@ REALNAME: #endif #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index bee88d3ce..7461aaf60 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ if (y <= 1) return x; +#if (MAX_CPU_NUMBER > 64) + if (y > 64) { + result = x / y; + return result; + } +#endif + y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); @@ -403,7 +410,7 @@ REALNAME: #define EPILOGUE .end #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/cpuid_arm64.c b/cpuid_arm64.c index bd7fb7f2d..a42346c88 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -121,7 +121,7 @@ int detect(void) return CPU_VULCAN; else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) return CPU_THUNDERX; - else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ + else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43")) return CPU_THUNDERX2T99; } diff --git a/cpuid_mips.c b/cpuid_mips.c index 15c58959e..c09902936 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_UNKNOWN 0 #define CPU_P5600 1 +#define CPU_1004K 2 static char *cpuname[] = { "UNKOWN", - "P5600" + "P5600", + "1004K" }; int detect(void){ @@ -90,7 +92,7 @@ int detect(void){ if (!strncmp("cpu", buffer, 3)){ p = strchr(buffer, ':') + 2; #if 0 - fprintf(stderr, "%s\n", p); + fprintf(stderr, "%s \n", p); #endif break; } @@ -99,43 +101,13 @@ int detect(void){ fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ + if (strstr(p, "5600")) { + return CPU_P5600; + } else if (strstr(p, "1004K")) { + return CPU_1004K; + } else return CPU_UNKNOWN; } - } - //Check model name for Loongson3 - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("model name", buffer, 10)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } - } #endif return CPU_UNKNOWN; } @@ -149,7 +121,7 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_P5600){ + if(detect()==CPU_P5600|| detect()==CPU_1004K){ printf("P5600"); }else{ printf("UNKNOWN"); @@ -170,6 +142,14 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); + } else if (detect()==CPU_1004K) { + printf("#define MIPS1004K\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 26144\n"); + printf("#define DTB_DEFAULT_ENTRIES 8\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define UNKNOWN\n"); } @@ -178,6 +158,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_P5600) { printf("p5600\n"); + } else if (detect()==CPU_1004K) { + printf("1004K\n"); }else{ printf("mips\n"); } diff --git a/ctest.c b/ctest.c index 27d3b473a..00be423d1 100644 --- a/ctest.c +++ b/ctest.c @@ -60,6 +60,14 @@ OS_FREEBSD OS_NETBSD #endif +#if defined(__OpenBSD__) +OS_OPENBSD +#endif + +#if defined(__DragonFly__) +OS_DRAGONFLY +#endif + #if defined(__sun) OS_SUNOS #endif diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index bfd991ffb..4903aa5bd 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,12 @@ #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if __STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 65002ae46..574f825b0 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,12 @@ #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if __STDC_VERSION__ >= 201112L +_Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index a1ed8bbb1..4ab1ee8cc 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,7 +91,12 @@ #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if __STDC_VERSION__ >= 201112L +_Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 2e0fe190d..794dfb20e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) #include #include #include diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 8d62a8125..fccdb4320 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,6 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +#include #include #include //#include @@ -49,11 +50,16 @@ int blas_server_avail = 0; -static void * blas_thread_buffer[MAX_CPU_NUMBER]; +static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; +#if __STDC_VERSION__ >= 201112L +static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#else +static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#endif void goto_set_num_threads(int num_threads) { - int i=0; + int i=0, j=0; if (num_threads < 1) num_threads = blas_num_threads; @@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); //adjust buffer for each thread - for(i=0; i mode & BLAS_PTHREAD) == 0)) { pos = omp_get_thread_num(); - buffer = blas_thread_buffer[pos]; + buffer = blas_thread_buffer[buf_index][pos]; //fallback if(buffer==NULL) { @@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){ - BLASLONG i; + BLASLONG i, buf_index; if ((num <= 0) || (queue == NULL)) return 0; @@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ } #endif + while(true) { + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { +#if __STDC_VERSION__ >= 201112L + _Bool inuse = false; + if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { +#else + if(blas_buffer_inuse[i] == false) { + blas_buffer_inuse[i] = true; +#endif + buf_index = i; + break; + } + } + if(i != MAX_PARALLEL_NUMBER) + break; + } + #pragma omp parallel for schedule(static) for (i = 0; i < num; i ++) { @@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ queue[i].position = i; #endif - exec_threads(&queue[i]); + exec_threads(&queue[i], buf_index); } +#if __STDC_VERSION__ >= 201112L + atomic_store(&blas_buffer_inuse[buf_index], false); +#else + blas_buffer_inuse[buf_index] = false; +#endif + return 0; } diff --git a/driver/others/memory.c b/driver/others/memory.c index 474d97c4d..ef328b945 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#else +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -209,7 +212,8 @@ int ret; size = CPU_ALLOC_SIZE(nums); ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; - nums = CPU_COUNT_S(size,cpusetp); + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -246,7 +250,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -344,7 +348,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -368,7 +372,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7d041b907..87a27712f 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -54,6 +54,9 @@ static char* openblas_config_str="" #ifdef NO_AFFINITY "NO_AFFINITY " #endif +#ifdef USE_OPENMP + "USE_OPENMP " +#endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif @@ -61,18 +64,23 @@ static char* openblas_config_str="" #ifdef DYNAMIC_ARCH char *gotoblas_corename(); -static char tmp_config_str[256]; #endif +static char tmp_config_str[256]; +int openblas_get_parallel(); char* CNAME() { -#ifndef DYNAMIC_ARCH - return openblas_config_str; -#else +char tmpstr[20]; strcpy(tmp_config_str, openblas_config_str); +#ifdef DYNAMIC_ARCH strcat(tmp_config_str, gotoblas_corename()); - return tmp_config_str; #endif +if (openblas_get_parallel() == 0) + sprintf(tmpstr, " SINGLE_THREADED"); +else + snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); + strcat(tmp_config_str, tmpstr); + return tmp_config_str; } @@ -83,3 +91,4 @@ char* openblas_get_corename() { return gotoblas_corename(); #endif } + diff --git a/exports/Makefile b/exports/Makefile index 79c251d62..53d4f75bb 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -156,7 +156,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) so : ../$(LIBSONAME) diff --git a/f_check b/f_check index 941a9a5c4..997e02393 100644 --- a/f_check +++ b/f_check @@ -97,7 +97,7 @@ if ($compiler eq "") { if ($data =~ /Intel/) { $vendor = INTEL; - $openmp = "-openmp"; + $openmp = "-fopenmp"; } if ($data =~ /Sun Fortran/) { @@ -127,7 +127,7 @@ if ($compiler eq "") { # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; - if ($data =~ /zho_ge__/) { + if ($data =~ / zho_ge__/) { $need2bu = 1; } } @@ -155,7 +155,7 @@ if ($compiler eq "") { if ($compiler =~ /ifort/) { $vendor = INTEL; $bu = "_"; - $openmp = "-openmp"; + $openmp = "-fopenmp"; } if ($compiler =~ /pathf/) { diff --git a/getarch.c b/getarch.c index 24ea5fe5f..992fc2b95 100644 --- a/getarch.c +++ b/getarch.c @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef OS_WINDOWS #include #endif -#if defined(__FreeBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) #include #include #endif @@ -1074,7 +1074,7 @@ static int get_num_cores(void) { #ifdef OS_WINDOWS SYSTEM_INFO sysinfo; -#elif defined(__FreeBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) int m[2], count; size_t len; #endif @@ -1088,7 +1088,7 @@ static int get_num_cores(void) { GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; -#elif defined(__FreeBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) m[0] = CTL_HW; m[1] = HW_NCPU; len = sizeof(int); diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 4c47e9e91..0e16632e0 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; +/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ + nthreads = 1; + if(nthreads > 1) { buffer_size = n > 16 ? 0 : n * 4 + 40; } diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4284fbfa0..066426396 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,8 @@ USE_TRMM = 1 endif ifeq ($(CORE), HASWELL) -ifeq ($(ARCH), x86_64) USE_TRMM = 1 endif -endif ifeq ($(CORE), ZEN) USE_TRMM = 1 diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 37515f399..c35b8aece 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble axpy_kernel_L999 - +/* cmp INC_X, #0 beq axpy_kernel_L999 cmp INC_Y, #0 beq axpy_kernel_L999 - +*/ cmp INC_X, #1 bne axpy_kernel_S_BEGIN diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index 25f563690..ea296dbc5 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble rot_kernel_L999 - +/* cmp INC_X, #0 beq rot_kernel_L999 cmp INC_Y, #0 beq rot_kernel_L999 - +*/ cmp INC_X, #1 bne rot_kernel_S_BEGIN @@ -584,6 +584,12 @@ rot_kernel_S1: rot_kernel_S10: KERNEL_S1 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 subs I, I, #1 bne rot_kernel_S10 diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c index 60cdeed1c..e9ad45fa0 100644 --- a/kernel/generic/trmm_ltcopy_2.c +++ b/kernel/generic/trmm_ltcopy_2.c @@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X > posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else -#ifdef UNIT if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; } else { +#ifdef UNIT + data02 = *(ao1 + 1); b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; } -#endif - b[ 1] = *(ao1 + 1); - b += 2; } posY += 2; @@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c index 12642e7db..b83989f55 100644 --- a/kernel/generic/trmm_utcopy_16.c +++ b/kernel/generic/trmm_utcopy_16.c @@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 15); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; @@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a13 += i; a14 += i; a15 += i; - a16 += i; */ + a16 += i; b += 16 * i; } else if (X > posY) { @@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; - a08 += i; */ + a08 += i; b += 8 * i; } else if (X > posY) { @@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 8; } - /* a02 += i * lda; + a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; - a08 += i * lda; */ + a08 += i * lda; } else { #ifdef UNIT b[ 0] = ONE; @@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; - a04 += i; */ + a04 += i; b += 4 * i; } else if (X > posY) { @@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += lda; b += 4; } - /* a02 += lda; + a02 += lda; a03 += lda; - a04 += lda; */ + a04 += lda; } else { #ifdef UNIT @@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { a01 ++; a02 ++; - } else { -#ifdef UNIT + b += 2; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; } else { +#ifdef UNIT b[ 0] = ONE; - } + b[ 1] = *(a01 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); #endif - b[ 1] = *(a01 + 1); - } - b += 2; + b += 2; + } } posY += 2; } @@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i > 0) { do { if (X < posY) { - a01 ++; - } else { -#ifdef UNIT + a01 += 1; + b ++; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + a01 += lda; + b ++; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + b[ 0] = *(a01 + 0); #endif - a01 += lda; - } - b ++; - X ++; - i --; + a01 += lda; + b ++; + } + + X += 1; + i --; } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c index 75076c382..ae4a19e32 100644 --- a/kernel/generic/trmm_utcopy_2.c +++ b/kernel/generic/trmm_utcopy_2.c @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X < posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else if (X > posY) { @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = ZERO; #endif - // ao1 += lda; + ao1 += lda; b += 2; } } @@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (m > 0) { do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { #ifdef UNIT - if (X > posY) { + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT - } else { - b[ 0] = ONE; - } -#endif - b ++; - ao1 += lda; - X ++; + b += 1; + ao1 += lda; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c index e5844094e..441f7338b 100644 --- a/kernel/generic/trmm_utcopy_4.c +++ b/kernel/generic/trmm_utcopy_4.c @@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { if (m & 2) { - /* ao1 += 2; + ao1 += 2; ao2 += 2; ao3 += 2; - ao4 += 2; */ + ao4 += 2; b += 8; } if (m & 1) { - /* ao1 += 1; + ao1 += 1; ao2 += 1; ao3 += 1; - ao4 += 1; */ + ao4 += 1; b += 4; } @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data08; ao1 += 2 * lda; - // ao2 += 2 * lda; + ao2 += 2 * lda; b += 8; } @@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = data03; b[ 3] = data04; - // ao1 += lda; + ao1 += lda; b += 4; } @@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i) { if (X < posY) { - // ao1 += 2; + ao1 += 2; b += 2; } else if (X > posY) { @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { + b += 1; ao1 += 1; - } else { -#ifdef UNIT + } else if (X > posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - } - b ++; - X ++; + ao1 += lda; + b += 1; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c index 07bb137d4..12043eb33 100644 --- a/kernel/generic/trsm_ltcopy_4.c +++ b/kernel/generic/trsm_ltcopy_4.c @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } a1 += 2 * lda; - // a2 += 2 * lda; + a2 += 2 * lda; b += 8; ii += 2; diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c index 7969f4f3d..457890ceb 100644 --- a/kernel/generic/ztrmm_ltcopy_2.c +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { -#ifdef UNIT + + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); - b[ 1] = *(ao1 + 1); -#ifdef UNIT + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + b[ 0] = ONE; b[ 1] = ZERO; - } + b[ 2] = data3; + b[ 3] = data4; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; #endif - b += 4; + b += 4; + } } posY += 2; @@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 0e33a7d18..08f85e891 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0; + FLOAT data01, data02; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index c34d741ee..387bb2532 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0, data03, data04; - FLOAT data05, data06, data07 = 0.0, data08 = 0.0; + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; diff --git a/kernel/mips/KERNEL.1004K b/kernel/mips/KERNEL.1004K new file mode 100644 index 000000000..67135356e --- /dev/null +++ b/kernel/mips/KERNEL.1004K @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.P5600 diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a16704d5..1ab193069 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c else SASUMKERNEL = ../mips/asum.c DASUMKERNEL = ../mips/asum.c -CASUMKERNEL = ../mips/asum.c -ZASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c endif ifdef HAVE_MSA @@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -endif \ No newline at end of file +endif diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c index 82c3a96cf..380b94d06 100644 --- a/kernel/mips/dgemv_n_msa.c +++ b/kernel/mips/dgemv_n_msa.c @@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v2f64 v_alpha; - v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0; + v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0}; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; - v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; + v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0}; v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c index de7f7167f..89c9f80f6 100644 --- a/kernel/mips/dot.c +++ b/kernel/mips/dot.c @@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) while(i < n) { - - dot += y[iy] * x[ix] ; +#if defined(DSDOT) + dot += (double)y[iy] * (double)x[ix] ; +#else + dot += y[iy] * x[ix]; +#endif ix += inc_x ; iy += inc_y ; i++ ; diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c index e1ecb5473..66e3adebf 100644 --- a/kernel/mips/sgemv_n_msa.c +++ b/kernel/mips/sgemv_n_msa.c @@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0; + v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0}; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; - v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; + v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0}; v_alpha = COPY_FLOAT_TO_VECTOR(alpha); diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6 index 344b205fe..e6d2c9a51 100644 --- a/kernel/power/KERNEL.POWER6 +++ b/kernel/power/KERNEL.POWER6 @@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacgv.c b/lapack-netlib/LAPACKE/src/lapacke_clacgv.c index 0014906ed..9a77c8ec0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacgv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacgv.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n, x, incx ) ) { return -2; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index 18e24509d..3aeb0d7e4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); return -8; } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfg.c b/lapack-netlib/LAPACKE/src/lapacke_clarfg.c index 0381a42bc..9e852a406 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfg.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha, if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfx.c b/lapack-netlib/LAPACKE/src/lapacke_clarfx.c index 977e283e1..786c21412 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfx.c @@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfx", -1 ); return -1; @@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_c_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_c_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_classq.c b/lapack-netlib/LAPACKE/src/lapacke_classq.c index b8f231dbb..e4d746c5a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_classq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_classq.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index 1864c4121..d9fb2dca0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c index 51f6d8276..ba026ae68 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 55c26f4b6..a1f49dde1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); return -8; } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c index 0f627b323..df401c41d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c @@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x, if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_d_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c index ab4a58e76..7b7b7201e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c @@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, lapack_int n, const double* v, double tau, double* c, lapack_int ldc, double* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfx", -1 ); return -1; @@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_d_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_d_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlassq.c b/lapack-netlib/LAPACKE/src/lapacke_dlassq.c index a564240d4..0e096b6d4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlassq.c @@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_d_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c b/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c index 93d3d3d30..7fbfb11fd 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c @@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index 05e4c57c8..db75a6609 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 72fa75ef1..0ebdc931a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); return -8; } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfg.c b/lapack-netlib/LAPACKE/src/lapacke_slarfg.c index 295277387..ea9a83575 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfg.c @@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x, if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_s_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfx.c b/lapack-netlib/LAPACKE/src/lapacke_slarfx.c index 426137815..c2b797a98 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfx.c @@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, lapack_int n, const float* v, float tau, float* c, lapack_int ldc, float* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfx", -1 ); return -1; @@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_s_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_s_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slassq.c b/lapack-netlib/LAPACKE/src/lapacke_slassq.c index 668289e18..3e265e359 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slassq.c @@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_s_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c b/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c index 333789837..bf8eed4f9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c @@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 5a9d44138..9f0e9fddf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c b/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c index 3b1130ba5..cd412dc24 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n, x, incx ) ) { return -2; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 6ea4960f3..4fc2eb0ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); return -8; } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c index 14e587fcc..a566a08cb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha, if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c index 1dd1f5204..b4ebf727e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c @@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfx", -1 ); return -1; @@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_z_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_z_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlassq.c b/lapack-netlib/LAPACKE/src/lapacke_zlassq.c index a218c9b62..b8972b974 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlassq.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c index f8936cd5a..433385440 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c b/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c index d735c5561..80bbd9529 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 6645121c1..91806bb1d 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -512,7 +512,7 @@ C END IF * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index d1ccc1a89..4ca0507e4 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -481,7 +481,7 @@ * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 7b623481b..508afca06 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -512,7 +512,7 @@ C END IF * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index db8c836e0..591ce4a99 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,6 +67,26 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t getrf_lock = 0; +#else +static BLASULONG getrf_lock = 0UL; +#endif + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t getrf_flag_lock = 0; +#else +static BLASULONG getrf_flag_lock = 0UL; +#endif + + + + static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); @@ -99,7 +119,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; +#if __STDC_VERSION__ >= 201112L + _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; +#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; +#endif blasint *ipiv = (blasint *)args -> c; @@ -177,7 +201,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra /* Non blocking implementation */ typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if __STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); @@ -216,9 +245,12 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * FLOAT *sbb= sb; blasint *ipiv = (blasint *)args -> c; - + BLASLONG jw; +#if __STDC_VERSION__ >= 201112L + _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; +#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; - +#endif if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); @@ -245,8 +277,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { for (i = 0; i < args -> nthreads; i++) +#if 1 + { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + } while (jw); + } +#else while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; - +#endif for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -283,18 +327,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * b + (is + jjs * lda) * COMPSIZE, lda, is); } } - MB; - for (i = 0; i < args -> nthreads; i++) + for (i = 0; i < args -> nthreads; i++) { + LOCK_COMMAND(&getrf_lock); job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - + UNLOCK_COMMAND(&getrf_lock); + } } + LOCK_COMMAND(&getrf_flag_lock); flag[mypos * CACHE_LINE_SIZE] = 0; + UNLOCK_COMMAND(&getrf_flag_lock); if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + LOCK_COMMAND(&getrf_lock); job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; + UNLOCK_COMMAND(&getrf_lock); } } @@ -318,7 +367,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if ((current != mypos) && (!is)) { +#if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + } while (jw == 0); +#else while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; +#endif } KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, @@ -327,7 +387,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * MB; if (is + min_i >= m) { + LOCK_COMMAND(&getrf_lock); job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + UNLOCK_COMMAND(&getrf_lock); } } @@ -339,7 +401,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { +#if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); + } while(jw != 0); +#else while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; +#endif } } @@ -374,11 +447,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG i, j, k, is, bk; BLASLONG num_cpu; + BLASLONG f; #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else - volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); +#if __STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #endif #ifndef COMPLEX @@ -501,11 +580,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (mm >= nn) { width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = nn; if (nn < width) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = mm; if (mm < width) width = mm; if (nn <= 0) width = mm; mm -= width; @@ -514,11 +595,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } else { width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = mm; if (mm < width) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = nn; if (nn < width) width = nn; if (mm <= 0) width = nn; nn -= width; @@ -561,7 +644,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_n_new[1] = offset + is + bk; if (num_cpu > 0) { - queue[num_cpu - 1].next = NULL; exec_blas_async(0, &queue[0]); @@ -572,8 +654,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (iinfo && !info) info = iinfo + is; - for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; - + for (i = 0; i < num_cpu; i ++) { +#if 1 + LOCK_COMMAND(&getrf_flag_lock); + f=flag[i*CACHE_LINE_SIZE]; + UNLOCK_COMMAND(&getrf_flag_lock); + while (f!=0) { + LOCK_COMMAND(&getrf_flag_lock); + f=flag[i*CACHE_LINE_SIZE]; + UNLOCK_COMMAND(&getrf_flag_lock); + }; +#else + while (flag[i*CACHE_LINE_SIZE]) {}; +#endif + } TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { @@ -634,8 +728,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; - - volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); +#if __STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c new file mode 100644 index 000000000..c2fee6bd1 --- /dev/null +++ b/lapack/getrf/potrf_parallel.c @@ -0,0 +1,664 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + +//The array of job_t may overflow the stack. +//Instead, use malloc to alloc job_t. +#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD +#define USE_ALLOC_HEAP +#endif + + +static FLOAT dm1 = -1.; + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYRK_KERNEL_U +#else +#define KERNEL_FUNC SYRK_KERNEL_L +#endif +#endif + +#ifndef LOWER +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_LT +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#endif +#else +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RR +#endif +#endif + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef LOWER +#define TRANS +#endif + +#ifndef SYRK_LOCAL +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL SYRK_UT +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_LN +#else +#define SYRK_LOCAL SYRK_LT +#endif +#endif + +typedef struct { +#if __STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef S +#define S args -> a +#endif +#ifndef A +#define A args -> b +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef N +#define N args -> m +#endif +#ifndef K +#define K args -> k +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda; + BLASLONG m_from, m_to; + + FLOAT *alpha; + FLOAT *a, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + + alpha = (FLOAT *)args -> alpha; + + m_from = range_n[mypos + 0]; + m_to = range_n[mypos + 1]; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); +#endif + + div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; + } + +#ifndef LOWER + TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#else + TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#endif + + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + +#ifndef LOWER + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; +#else + if (min_jj > GEMM_P) min_jj = GEMM_P; +#endif + +#ifndef LOWER + OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (k, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb, + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + a + jjs * lda * COMPSIZE, lda, 0); +#else + ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (min_jj, k, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + sb, + a + jjs * COMPSIZE, lda, 0); +#endif + } + +#ifndef LOWER + for (i = 0; i <= mypos; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#else + for (i = mypos; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#endif + + WMB; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + /* thread has to wait */ + if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, m_from, xxx); + + if (m_from + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } +#ifndef LOWER + current ++; +#else + current --; +#endif + } + } + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + } + + return 0; + } + +static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ + + blas_arg_t newarg; + +#ifndef USE_ALLOC_HEAP + job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.alpha = args -> alpha; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + + newarg.common = (void *)job; + + n_from = 0; + n_to = args -> m; + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + + if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#ifdef USE_ALLOC_HEAP + free(job); +#endif + + return 0; +} + +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); +#endif + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); +#endif + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { +#ifndef USE_SIMPLE_THREADED_LEVEL3 + newarg.m = n - i - bk; + newarg.k = bk; +#ifndef LOWER + newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; +#else + newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; +#endif + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + + thread_driver(&newarg, sa, sb); +#else + +#ifndef LOWER + newarg.m = bk; + newarg.n = n - i - bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); +#endif +#else + newarg.m = n - i - bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + bk + i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); +#endif +#endif + +#endif + } + } + return 0; +} diff --git a/param.h b/param.h index 189cdc4a0..4227d548e 100644 --- a/param.h +++ b/param.h @@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7..77a42d84f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,6 +25,7 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms +if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -33,6 +34,7 @@ set(OpenBLAS_utest_src ) endif() endif() +endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index e40b3c6db..e071540dc 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,11 +17,13 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch +ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif +endif all : run_test diff --git a/utest/test_fork.c b/utest/test_fork.c index e7a8dbcee..9e0244305 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" +#include #include #include