Merge branch 'release-0.2.16'

This commit is contained in:
Zhang Xianyi 2016-03-15 14:49:10 -04:00
commit fced5744fb
3456 changed files with 203824 additions and 30558 deletions

1
.gitignore vendored
View File

@ -68,3 +68,4 @@ test/zblat2
test/zblat3
build
build.*
*.swp

View File

@ -24,7 +24,12 @@ before_install:
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
script:
- set -e
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
# whitelist
branches:

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 14)
set(OpenBLAS_PATCH_VERSION 16)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH)
list(APPEND BLASDIRS kernel)
endif ()
if (DEFINED UTEST_CHECK)
set(SANITY_CHECK 1)
endif ()
if (DEFINED SANITY_CHECK)
list(APPEND BLASDIRS reference)
endif ()
@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED})
message(FATAL_ERROR "Neither static nor shared are enabled.")
endif ()
#Set default output directory
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
endforeach()
enable_testing()
add_subdirectory(utest)
if(NOT MSVC)
#only build shared library for MSVC
@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
endif()
#build test and ctest
enable_testing()
add_subdirectory(test)
if(NOT NO_CBLAS)
add_subdirectory(ctest)

View File

@ -121,6 +121,17 @@ In chronological order:
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
ARMv8 support.
* Jerome Robert <jeromerobert@gmx.com>
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
* [2015-12-28] Allow to force the number of parallel make job
* [2015-12-28] Fix detection of AMD E2-3200 detection
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what expected
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
* Dan Kortschak
* [2015-01-07] Added test for drotmg bug #484.
@ -130,5 +141,11 @@ In chronological order:
* Martin Koehler <https://github.com/grisuthedragon/>
* [2015-09-07] Improved imatcopy
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
* [2015-11-20] lapack-test fixes for Cortex-A57
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

View File

@ -1,4 +1,57 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.16
15-Mar-2016
common:
* Avoid potential getenv segfault. (#716)
* Import LAPACK svn bugfix #142-#147,#150-#155
x86/x86_64:
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
* Fix bug with scipy linalg test.
ARM:
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
POWER:
* Optimize D and Z BLAS3 functions for Power8.
====================================================================
Version 0.2.16.rc1
23-Feb-2016
common:
* Upgrade LAPACK to 3.6.0 version.
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
LAPACK deprecated functions.
* Add MAKE_NB_JOBS option in Makefile.
Force number of make jobs.This is particularly
useful when using distcc. (#735. Thanks, Jerome Robert.)
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
* Improve small zger, zgemv, ztrmv using stack alloction (#727. Thanks, Jerome Robert)
* Let openblas_get_num_threads return the number of active threads.
(#760. Thanks, Jerome Robert)
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
x86/x86_64:
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
* Detect Intel Avoton.
* Detect AMD Trinity, Richland, E2-3200.
* Fix gemv performance bug on Mac OSX Intel Haswell.
* Fix some bugs with CMake and Visual Studio
ARM:
* Support and optimize Cortex-A57 AArch64.
(#686. Thanks, Ashwin Sekhar TK)
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
* Update ARMV6 kernels.
POWER:
* Fix detection of POWER architecture
(#684. Thanks, Sebastien Villemot)
====================================================================
Version 0.2.15
27-Oct-2015

View File

@ -7,10 +7,6 @@ ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel
endif
ifdef UTEST_CHECK
SANITY_CHECK = 1
endif
ifdef SANITY_CHECK
BLASDIRS += reference
endif
@ -85,22 +81,22 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), Linux)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@ -117,10 +113,8 @@ ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
ifdef UTEST_CHECK
$(MAKE) -C utest all
endif
endif
ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
@ -249,16 +243,23 @@ ifndef NOFORTRAN
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(FC), gfortran)
ifeq ($(F_COMPILER), GFORTRAN)
-@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
else
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
else
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif
@ -288,7 +289,17 @@ endif
lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif
lapack-runtest:
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)

View File

@ -11,8 +11,8 @@ endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
@ -29,5 +29,3 @@ ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
endif

View File

@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif

View File

@ -29,7 +29,7 @@ install : lib.grd
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@ -48,10 +48,10 @@ endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
#for install static library
@ -64,7 +64,7 @@ endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.15
VERSION = 0.2.16
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -79,6 +79,9 @@ VERSION = 0.2.15
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0
# BUILD_LAPACK_DEPRECATED = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -108,6 +111,10 @@ NO_AFFINITY = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
# Force number of make jobs. The default is the number of logical CPU of the host.
# This is particularly useful when using distcc
# MAKE_NB_JOBS = 2
# If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1
@ -138,10 +145,6 @@ NO_AFFINITY = 1
# slow (Not implemented yet).
# SANITY_CHECK = 1
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1
# The installation directory.
# PREFIX = /opt/OpenBLAS
@ -159,10 +162,11 @@ COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# Improve GEMV and GER for small matrices by stack allocation.
# For details, https://github.com/xianyi/OpenBLAS/pull/482
# Set maximum stack allocation.
# The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV
# performance. For details, https://github.com/xianyi/OpenBLAS/pull/482
#
MAX_STACK_ALLOC=2048
# MAX_STACK_ALLOC = 0
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoid conflicts with other BLAS libraries, especially when using

View File

@ -139,6 +139,10 @@ NO_PARALLEL_MAKE=0
endif
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
ifdef MAKE_NB_JOBS
GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
endif
ifeq ($(HOSTCC), loongcc)
GETARCH_FLAGS += -static
endif
@ -292,12 +296,14 @@ endif
ifneq ($(OSNAME), WINNT)
ifneq ($(OSNAME), CYGWIN_NT)
ifneq ($(OSNAME), Interix)
ifneq ($(OSNAME), Android)
ifdef SMP
EXTRALIB += -lpthread
endif
endif
endif
endif
endif
# ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
@ -324,7 +330,8 @@ ifdef SANITY_CHECK
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
endif
ifdef MAX_STACK_ALLOC
MAX_STACK_ALLOC ?= 2048
ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
@ -374,7 +381,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@ -388,7 +395,7 @@ endif
ifeq ($(USE_OPENMP), 1)
#check
#check
ifeq ($(USE_THREAD), 0)
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
endif
@ -952,17 +959,18 @@ ifeq ($(OSNAME), SunOS)
TAR = gtar
PATCH = gpatch
GREP = ggrep
AWK = nawk
else
TAR = tar
PATCH = patch
GREP = grep
AWK = awk
endif
ifndef MD5SUM
MD5SUM = md5sum
endif
AWK = awk
REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
@ -971,16 +979,25 @@ ifeq ($(DEBUG), 1)
COMMON_OPT += -g
endif
ifeq ($(DEBUG), 1)
FCOMMON_OPT += -g
endif
ifndef COMMON_OPT
COMMON_OPT = -O2
endif
ifndef FCOMMON_OPT
FCOMMON_OPT = -O2 -frecursive
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
override FFLAGS += $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
#For LAPACK Fortran codes.
@ -1170,4 +1187,3 @@ SUNPATH = /opt/sunstudio12.1
else
SUNPATH = /opt/SUNWspro
endif

View File

@ -75,10 +75,11 @@ Please read GotoBLAS_01Readme.txt
#### ARM64:
- **ARMV8**: Experimental
- **ARM Cortex-A57**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.

View File

@ -74,3 +74,5 @@ ARMV5
7.ARM 64-bit CPU:
ARMV8
CORTEXA57

199
USAGE.md Normal file
View File

@ -0,0 +1,199 @@
# Notes on OpenBLAS usage
## Usage
#### Program is Terminated. Because you tried to allocate too many memory regions
In OpenBLAS, we mange a pool of memory buffers and allocate the number of
buffers as the following.
```
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
```
This error indicates that the program exceeded the number of buffers.
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`.
#### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
following ways:
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
* Call `openblas_set_num_threads(1)` in the application on runtime.
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
If the application is parallelized by OpenMP, please use OpenBLAS built with
`USE_OPENMP=1`
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
The environment variable which control the kernel selection is
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
returns the used target.
#### How could I disable OpenBLAS threading affinity on runtime?
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
variable to disable threading affinity on runtime. For example, before the
running,
```
export OPENBLAS_MAIN_FREE=1
```
Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1`
in `Makefile.rule`.
## Linking with the library
* Link with shared library
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
If the library is multithreaded, please add `-lpthread`. If the library
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
* Link with static library
`gcc -o test test.c /your/path/libopenblas.a`
You can download `test.c` from https://gist.github.com/xianyi/5780018
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
default), custom programs statically linked against `libopenblas.a` should also
link with the pthread library e.g.:
```
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
```
Failing to add the `-lpthread` flag will cause errors such as:
```
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
...
```
## Code examples
#### Call CBLAS interface
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
```
#include <cblas.h>
#include <stdio.h>
void main()
{
int i=0;
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
for(i=0; i<9; i++)
printf("%lf ", C[i]);
printf("\n");
}
```
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
#### Call BLAS Fortran interface
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
```
#include "stdio.h"
#include "stdlib.h"
#include "sys/time.h"
#include "time.h"
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
int main(int argc, char* argv[])
{
int i;
printf("test!\n");
if(argc<4){
printf("Input Error\n");
return 1;
}
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
int sizeofa = m * k;
int sizeofb = k * n;
int sizeofc = m * n;
char ta = 'N';
char tb = 'N';
double alpha = 1.2;
double beta = 0.001;
struct timeval start,finish;
double duration;
double* A = (double*)malloc(sizeof(double) * sizeofa);
double* B = (double*)malloc(sizeof(double) * sizeofb);
double* C = (double*)malloc(sizeof(double) * sizeofc);
srand((unsigned)time(NULL));
for (i=0; i<sizeofa; i++)
A[i] = i%3+1;//(rand()%100)/10.0;
for (i=0; i<sizeofb; i++)
B[i] = i%3+1;//(rand()%100)/10.0;
for (i=0; i<sizeofc; i++)
C[i] = i%3+1;//(rand()%100)/10.0;
//#if 0
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
gettimeofday(&start, NULL);
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
gettimeofday(&finish, NULL);
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
double gflops = 2.0 * m *n*k;
gflops = gflops/duration*1.0e-6;
FILE *fp;
fp = fopen("timeDGEMM.txt", "a");
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
fclose(fp);
free(A);
free(B);
free(C);
return 0;
}
```
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
` ./time_dgemm <m> <n> <k> `
## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
## BLAS reference manual
If you want to understand every BLAS function and definition, please read
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
or [netlib.org](http://netlib.org/blas/)
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
## How to reference OpenBLAS.
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.

View File

@ -39,4 +39,6 @@ before_build:
- cmake -G "Visual Studio 12 Win64" .
test_script:
- echo Build OK!
- echo Running Test
- cd c:\projects\OpenBLAS\utest
- openblas_utest

View File

@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib

View File

@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
srandom(getpid());
#endif
for(j = 0; j < m; j++){
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;

View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import os
import sys
import time
import numpy
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas
def run_dsyrk(N, l):
A = randn(N, N).astype('float64', order='F')
C = zeros((N, N), dtype='float64', order='F')
start = time.time()
for i in range(0, l):
blas.dsyrk(1.0, A, c=C, overwrite_c=True)
end = time.time()
timediff = (end - start)
mflops = (N * N * N) * l / timediff
mflops *= 1e-6
size = "%dx%d" % (N, N)
print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
N = 128
NMAX = 2048
NINC = 128
LOOPS = 1
z = 0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p)
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range(N, NMAX + NINC, NINC):
run_dsyrk(i, LOOPS)

View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import os
import sys
import time
import numpy
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas
def run_ssyrk(N, l):
A = randn(N, N).astype('float32', order='F')
C = zeros((N, N), dtype='float32', order='F')
start = time.time()
for i in range(0, l):
blas.ssyrk(1.0, A, c=C, overwrite_c=True)
end = time.time()
timediff = (end - start)
mflops = (N * N * N) * l / timediff
mflops *= 1e-6
size = "%dx%d" % (N, N)
print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
N = 128
NMAX = 2048
NINC = 128
LOOPS = 1
z = 0
for arg in sys.argv:
if z == 1:
N = int(arg)
elif z == 2:
NMAX = int(arg)
elif z == 3:
NINC = int(arg)
elif z == 4:
LOOPS = int(arg)
z = z + 1
if 'OPENBLAS_LOOPS' in os.environ:
p = os.environ['OPENBLAS_LOOPS']
if p:
LOOPS = int(p)
print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")
for i in range(N, NMAX + NINC, NINC):
run_ssyrk(i, LOOPS)

196
benchmark/smallscaling.c Normal file
View File

@ -0,0 +1,196 @@
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <omp.h>
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10
// number of loop for a 1x1 matrix. Lower it if the test is
// too slow on you computer.
#define NLOOP 2e7
typedef struct {
int matrix_size;
int n_loop;
void (* bench_func)();
void (* blas_func)();
void * (* create_matrix)(int size);
} BenchParam;
void * s_create_matrix(int size) {
float * r = malloc(size * sizeof(double));
int i;
for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * c_create_matrix(int size) {
float * r = malloc(size * 2 * sizeof(double));
int i;
for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * z_create_matrix(int size) {
double * r = malloc(size * 2 * sizeof(double));
int i;
for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * d_create_matrix(int size) {
double * r = malloc(size * sizeof(double));
int i;
for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
void trmv_bench(BenchParam * param)
{
int i, n;
int size = param->matrix_size;
n = param->n_loop / size;
int one = 1;
void * A = param->create_matrix(size * size);
void * y = param->create_matrix(size);
for(i = 0; i < n; i++) {
param->blas_func("U", "N", "N", &size, A, &size, y, &one);
}
free(A);
free(y);
}
void gemv_bench(BenchParam * param)
{
int i, n;
int size = param->matrix_size;
n = param->n_loop / size;
double v = 1.01;
int one = 1;
void * A = param->create_matrix(size * size);
void * y = param->create_matrix(size);
for(i = 0; i < n; i++) {
param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
}
free(A);
free(y);
}
void ger_bench(BenchParam * param) {
int i, n;
int size = param->matrix_size;
n = param->n_loop / size;
double v = 1.01;
int one = 1;
void * A = param->create_matrix(size * size);
void * y = param->create_matrix(size);
for(i = 0; i < n; i++) {
param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
}
free(A);
free(y);
}
#ifndef _WIN32
void * pthread_func_wrapper(void * param) {
((BenchParam *)param)->bench_func(param);
pthread_exit(NULL);
}
#endif
#define NB_TESTS 5
void * TESTS[4 * NB_TESTS] = {
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
gemv_bench, dgemv_, d_create_matrix, "dgemv",
gemv_bench, zgemv_, z_create_matrix, "zgemv",
ger_bench, dger_, d_create_matrix, "dger",
ger_bench, zgerc_, z_create_matrix, "zgerc",
};
inline static double delta_time(struct timespec tick) {
struct timespec tock;
clock_gettime(CLOCK_MONOTONIC, &tock);
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
}
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifdef _WIN32
return 0;
#else
BenchParam threaded_param = *param;
pthread_t threads[nb_threads];
int t, rc;
struct timespec tick;
threaded_param.n_loop /= nb_threads;
clock_gettime(CLOCK_MONOTONIC, &tick);
for(t=0; t<nb_threads; t++){
rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
if (rc){
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
}
for(t=0; t<nb_threads; t++){
pthread_join(threads[t], NULL);
}
return delta_time(tick);
#endif
}
double seq_bench(BenchParam * param) {
struct timespec tick;
clock_gettime(CLOCK_MONOTONIC, &tick);
param->bench_func(param);
return delta_time(tick);
}
double omp_bench(BenchParam * param) {
BenchParam threaded_param = *param;
struct timespec tick;
int t;
int nb_threads = omp_get_max_threads();
threaded_param.n_loop /= nb_threads;
clock_gettime(CLOCK_MONOTONIC, &tick);
#pragma omp parallel for
for(t = 0; t < nb_threads; t ++){
param->bench_func(&threaded_param);
}
return delta_time(tick);
}
int main(int argc, char * argv[]) {
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
BenchParam param;
int test_id;
printf ("Running on %d threads\n", omp_get_max_threads());
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
double size = MIN_SIZE;
param.bench_func = TESTS[test_id * 4];
param.blas_func = TESTS[test_id * 4 + 1];
param.create_matrix = TESTS[test_id * 4 + 2];
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
param.n_loop = NLOOP;
while(size <= MAX_SIZE) {
param.matrix_size = (int)(size + 0.5);
double seq_time = seq_bench(&param);
double omp_time = omp_bench(&param);
double pthread_time = pthread_bench(&param, omp_get_max_threads());
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
"pthread %gs, speedup %g\n",
param.matrix_size, seq_time,
omp_time, seq_time / omp_time,
pthread_time, seq_time / pthread_time);
size *= inc_factor;
}
}
return(0);
}

View File

@ -6,6 +6,7 @@ $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$binary = $ENV{"BINARY"};

View File

@ -14,12 +14,12 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
if (NOT NO_EXPRECISION)
if (${F_COMPILER} MATCHES "GFORTRAN")
# N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
@ -28,35 +28,35 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
endif ()
if (USE_OPENMP)
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
message(WARNING "Clang doesn't support OpenMP yet.")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -openmp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
set(CEXTRALIB "${CEXTRALIB} -lstdc++")
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
endif ()
@ -87,7 +87,7 @@ if (${ARCH} STREQUAL "ia64")
set(BINARY_DEFINED 1)
if (${F_COMPILER} MATCHES "GFORTRAN")
if (${CMAKE_C_COMPILER} STREQUAL "GNU")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
# EXPRECISION = 1
# CCOMMON_OPT += -DEXPRECISION
endif ()

View File

@ -48,18 +48,18 @@ set(SLASRC
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
sgetc2.f sgetri.f
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
sggglm.f sgghrd.f sgglse.f sggqrf.f
sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
slansy.f slantb.f slantp.f slantr.f slanv2.f
slapll.f slapmt.f
@ -69,7 +69,7 @@ set(SLASRC
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f
slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
@ -97,7 +97,7 @@ set(SLASRC
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
stptrs.f
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strtrs.f stzrqf.f stzrzf.f sstemr.f
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f
@ -114,14 +114,14 @@ set(CLASRC
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f
cgesvx.f cgetc2.f cgetri.f
cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f
cgghrd.f cgglse.f cggqrf.f cggrqf.f
cggsvd.f cggsvp.f
DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f
cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f
chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f
checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f
@ -138,7 +138,7 @@ set(CLASRC
claed0.f claed7.f claed8.f
claein.f claesy.f claev2.f clags2.f clagtm.f
clahef.f clahef_rook.f clahqr.f
clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
clanhb.f clanhe.f
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
@ -149,7 +149,7 @@ set(CLASRC
clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f
cposv.f cposvx.f cpstrf.f cpstf2.f
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
@ -166,7 +166,7 @@ set(CLASRC
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
ctprfs.f ctptri.f
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
@ -186,18 +186,18 @@ set(DLASRC
dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
dgetc2.f dgetri.f
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
dggglm.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
dlapll.f dlapmt.f
@ -207,7 +207,7 @@ set(DLASRC
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f
dlargv.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
@ -235,7 +235,7 @@ set(DLASRC
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
dtptrs.f
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f
@ -251,14 +251,14 @@ set(ZLASRC
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
zgetri.f
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
zgghrd.f zgglse.f zggqrf.f zggrqf.f
zggsvd.f zggsvp.f
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
@ -275,7 +275,7 @@ set(ZLASRC
zlaed0.f zlaed7.f zlaed8.f
zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f
zlahef.f zlahef_rook.f zlahqr.f
zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
zlangt.f zlanhb.f
zlanhe.f
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
@ -288,7 +288,7 @@ set(ZLASRC
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f zlasyf.f zlasyf_rook.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
@ -306,7 +306,7 @@ set(ZLASRC
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
ztprfs.f ztptri.f
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f

View File

@ -2038,6 +2038,59 @@ set(MATGEN
lapacke_zlagsy_work.c
)
set(Utils_SRC
lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c
lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c
lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c
lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c
lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c
lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c
lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c
lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c
lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c
lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c
lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c
lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c
lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c
lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c
lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c
lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c
lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c
lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c
lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c
lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c
lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c
lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c
lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c
lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c
lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c
lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c
lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c
lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c
lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c
lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c
lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c
lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c
lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c
lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c
lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c
lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c
lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c
lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c
lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c
lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c
lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c
lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c
lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c
lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c
lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c
lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c
lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c
lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c
lapacke_dpb_nancheck.c lapacke_sst_nancheck.c
lapacke_dpb_trans.c lapacke_ssy_nancheck.c
)
set(LAPACKE_REL_SRC "")
if (BUILD_SINGLE)
list(APPEND LAPACKE_REL_SRC ${SSRC})
@ -2058,10 +2111,14 @@ endif ()
# add lapack-netlib folder to the sources
set(LAPACKE_SOURCES "")
foreach (LAE_FILE ${LAPACKE_REL_SRC})
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}")
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}")
endforeach ()
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include")
foreach (Utils_FILE ${Utils_SRC})
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}")
endforeach ()
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
include_directories(${lapacke_include_dir})
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")

View File

@ -86,13 +86,14 @@ extern "C" {
#if !defined(_MSC_VER)
#include <unistd.h>
#endif
#include <time.h>
#ifdef OS_LINUX
#include <malloc.h>
#include <sched.h>
#endif
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
#include <sched.h>
#endif
@ -331,12 +332,13 @@ typedef int blasint;
#endif
#endif
/*
#ifdef PILEDRIVER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
/*
#ifdef STEAMROLLER
@ -410,7 +412,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#ifndef ASSEMBLER
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p))
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
@ -726,6 +728,7 @@ typedef struct {
#endif
#ifndef ASSEMBLER
#include "common_stackalloc.h"
#if 0
#include "symcopy.h"
#endif

View File

@ -43,28 +43,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
static void __inline blas_lock(volatile BLASULONG *address){
long register ret;
BLASULONG ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"ldaxr %0, [%1] \n\t"
"stlxr w2, %2, [%1] \n\t"
"orr %0, %0, x2 \n\t"
: "=r"(ret)
: "r"(address), "r"(1l)
: "memory", "x2"
"mov x4, #1 \n\t"
"1: \n\t"
"ldaxr x2, [%1] \n\t"
"cbnz x2, 1b \n\t"
"2: \n\t"
"stxr w3, x4, [%1] \n\t"
"cbnz w3, 1b \n\t"
"mov %0, #0 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
: "memory", "x2" , "x3", "x4"
);
} while (ret);
MB;
}
#define BLAS_LOCK_DEFINED
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
@ -89,8 +100,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.text ;\
.align 4 ;\
.global REALNAME ;\
.func REALNAME ;\
.type REALNAME, %function ;\
REALNAME:
#define EPILOGUE
@ -107,7 +120,11 @@ REALNAME:
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#if defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

View File

@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
#if defined(POWER8)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
#endif
#
#ifndef L1_PREFETCH
#define L1_PREFETCH dcbt
#endif
@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

73
common_stackalloc.h Normal file
View File

@ -0,0 +1,73 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define STACK_ALLOC_PROTECT
#ifdef STACK_ALLOC_PROTECT
// Try to detect stack smashing
#include <assert.h>
#define STACK_ALLOC_PROTECT_SET volatile int stack_check = 0x7fc01234;
#define STACK_ALLOC_PROTECT_CHECK assert(stack_check == 0x7fc01234);
#else
#define STACK_ALLOC_PROTECT_SET
#define STACK_ALLOC_PROTECT_CHECK
#endif
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
/*
* Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC.
* Stack allocation is much faster than blas_memory_alloc or malloc, particularly
* when OpenBLAS is used from a multi-threaded application.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
//Original OpenBLAS/GotoBLAS codes.
#define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1)
#endif
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
#define STACK_FREE(BUFFER) \
STACK_ALLOC_PROTECT_CHECK \
if(!stack_alloc_size) \
blas_memory_free(BUFFER);
#else
#define STACK_FREE(BUFFER) blas_memory_free(BUFFER)
#endif

View File

@ -41,6 +41,10 @@
#ifndef ASSEMBLER
#ifdef C_MSVC
#include <intrin.h>
#endif
#define MB
#define WMB
@ -170,12 +174,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x;
#if defined(_MSC_VER) && !defined(__clang__)
result = x/y;
return result;
#else
y = blas_quick_divide_table[y];
#if defined(_MSC_VER) && !defined(__clang__)
(void*)result;
return x*y;
#else
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
return result;

View File

@ -396,7 +396,7 @@ REALNAME:
#define PROFCODE
#define EPILOGUE .end REALNAME
#define EPILOGUE .end
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)

View File

@ -115,6 +115,9 @@ int detect(void)
if (strstr(p, "0xc0f")) {
return CPU_CORTEXA15;
}
if (strstr(p, "0xd07")) {
return CPU_ARMV7; //ARMV8 on 32-bit
}
}
@ -158,6 +161,27 @@ int detect(void)
}
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if ((!strncmp("CPU architecture", buffer, 16)))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL) {
if (strstr(p, "8")) {
return CPU_ARMV7; //ARMV8 on 32-bit
}
}
#endif
return CPU_UNKNOWN;

View File

@ -29,12 +29,19 @@
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
#define CPU_CORTEXA57 2
static char *cpuname[] = {
"UNKOWN",
"ARMV8"
"UNKNOWN",
"ARMV8" ,
"CORTEXA57"
};
static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"cortexa57"
};
int get_feature(char *search)
{
@ -53,13 +60,13 @@ int get_feature(char *search)
{
p = strchr(buffer, ':') + 2;
break;
}
}
}
}
fclose(infile);
fclose(infile);
if( p == NULL ) return;
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
@ -82,11 +89,30 @@ int detect(void)
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
if (!strncmp("CPU part", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL) {
if (strstr(p, "0xd07")) {
return CPU_CORTEXA57;
}
}
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) ||
(!strncmp("CPU architecture", buffer, 16)))
{
p = strchr(buffer, ':') + 2;
break;
@ -100,7 +126,7 @@ int detect(void)
if (strstr(p, "AArch64"))
{
return CPU_ARMV8;
return CPU_ARMV8;
}
@ -118,23 +144,13 @@ char *get_corename(void)
void get_architecture(void)
{
printf("ARM");
printf("ARM64");
}
void get_subarchitecture(void)
{
int d = detect();
switch (d)
{
case CPU_ARMV8:
printf("ARMV8");
break;
default:
printf("UNKNOWN");
break;
}
printf("%s", cpuname[d]);
}
void get_subdirname(void)
@ -160,26 +176,34 @@ void get_cpuconfig(void)
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
printf("#define HAVE_NEON\n");
printf("#define HAVE_VFPV4\n");
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}
void get_libname(void)
{
int d = detect();
switch (d)
{
case CPU_ARMV8:
printf("armv8\n");
break;
}
printf("%s", cpuname_lower[d]);
}
void get_features(void)
{

View File

@ -55,6 +55,7 @@
#define CPUTYPE_POWER6 5
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8
char *cpuname[] = {
"UNKNOWN",
@ -65,6 +66,7 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
};
char *lowercpuname[] = {
@ -76,6 +78,7 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
"power8"
};
char *corename[] = {
@ -87,6 +90,7 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
};
int detect(void){
@ -115,7 +119,7 @@ int detect(void){
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;

View File

@ -1172,6 +1172,9 @@ int get_cpuname(void){
#endif
else
return CPUTYPE_NEHALEM;
case 13:
// Avoton
return CPUTYPE_NEHALEM;
}
break;
case 5:
@ -1229,6 +1232,7 @@ int get_cpuname(void){
case 2:
return CPUTYPE_OPTERON;
case 1:
case 3:
case 10:
return CPUTYPE_BARCELONA;
case 6:
@ -1239,13 +1243,19 @@ int get_cpuname(void){
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 2:
case 2: //AMD Piledriver
case 3: //AMD Richland
if(support_avx())
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 0:
switch(exmodel){
case 1: //AMD Trinity
if(support_avx())
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 3:
if(support_avx())
return CPUTYPE_STEAMROLLER;
@ -1668,6 +1678,9 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 13:
// Avoton
return CORE_NEHALEM;
}
break;
case 5:
@ -1718,7 +1731,8 @@ int get_coretype(void){
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 2:
case 2: //AMD Piledriver
case 3: //AMD Richland
if(support_avx())
return CORE_PILEDRIVER;
else
@ -1726,6 +1740,12 @@ int get_coretype(void){
case 0:
switch(exmodel){
case 1: //AMD Trinity
if(support_avx())
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 3:
if(support_avx())
return CORE_STEAMROLLER;

View File

@ -1365,8 +1365,9 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
IF( TRACE )
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@ -1365,8 +1365,9 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
IF( TRACE )
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@ -1335,8 +1335,9 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
IF( TRACE )
$ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@ -1339,8 +1339,9 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
IF( TRACE )
$ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@ -1350,7 +1350,7 @@
*
* Call the subroutine.
*
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
IF( FULL )THEN
IF( TRACE )
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
@ -1376,7 +1376,7 @@
CALL CZTPMV( IORDER, UPLO, TRANS, DIAG,
$ N, AA, XX, INCX )
END IF
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
IF( FULL )THEN
IF( TRACE )
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
@ -1465,7 +1465,7 @@
END IF
*
IF( .NOT.NULL )THEN
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
*
* Check the result.
*
@ -1473,7 +1473,7 @@
$ INCX, ZERO, Z, INCX, XT, G,
$ XX, EPS, ERR, FATAL, NOUT,
$ .TRUE. )
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
*
* Compute approximation to original vector.
*
@ -1611,7 +1611,7 @@
* .. Common blocks ..
COMMON /INFOC/INFOT, NOUTC, OK
* .. Executable Statements ..
CONJ = SNAME( 5: 5 ).EQ.'c'
CONJ = SNAME( 11: 11 ).EQ.'c'
* Define the number of arguments.
NARGS = 9
*

View File

@ -1366,8 +1366,9 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
IF( TRACE )
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@ -1366,8 +1366,9 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
IF( TRACE )
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@ -1,7 +1,7 @@
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@ -55,7 +55,7 @@
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
FLOAT *a, *x, *y;
BLASLONG incx, incy;
BLASLONG incx;
BLASLONG m_from, m_to, i;
#ifndef COMPLEX
FLOAT result;
@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
y = (FLOAT *)args -> c;
incx = args -> ldb;
incy = args -> ldc;
m_from = 0;
m_to = args -> m;

View File

@ -43,7 +43,7 @@
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
FLOAT *a, *x, *y;
BLASLONG lda, incx, incy;
BLASLONG incx, incy;
BLASLONG i, m_from, m_to;
FLOAT alpha_r;
#ifdef COMPLEX
@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
incx = args -> lda;
incy = args -> ldb;
lda = args -> ldc;
alpha_r = *((FLOAT *)args -> alpha + 0);
#ifdef COMPLEX

View File

@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
BLASLONG incx;
BLASLONG i, m_from, m_to;
FLOAT alpha_r;
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
FLOAT alpha_i;
#endif
@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
incx = args -> lda;
alpha_r = *((FLOAT *)args -> alpha + 0);
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
alpha_i = *((FLOAT *)args -> alpha + 1);
#endif

View File

@ -55,7 +55,7 @@
static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
FLOAT *a, *x, *y;
BLASLONG lda, incx, incy;
BLASLONG lda, incx;
BLASLONG m_from, m_to;
a = (FLOAT *)args -> a;
@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
lda = args -> lda;
incx = args -> ldb;
incy = args -> ldc;
m_from = 0;
m_to = args -> m;

View File

@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -43,12 +43,10 @@
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -43,12 +43,10 @@
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
x = buffer;
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
buffer += ((COMPSIZE * args -> m + 3) & ~3);
}
#ifndef TRANS
@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
queue[num_cpu - 1].next = NULL;

View File

@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}

View File

@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT
FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT
FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT
FLOAT ar, ai, br, bi, ratio, den;
#endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT
FLOAT ar, ai, br, bi, ratio, den;
#endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}

View File

@ -48,8 +48,7 @@ foreach (float_type ${FLOAT_TYPES})
# TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
# Need to set CONJ for trmm and trsm
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type})
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type})
@ -72,6 +71,10 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type})
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
#herk
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
#hemm
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type})
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type})
@ -96,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES})
endif()
endif ()
endforeach ()
# for gemm3m
if(USE_GEMM3M)
foreach (GEMM_DEFINE ${GEMM_DEFINES})
string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC)
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
endif ()
endforeach ()
endif()
endif ()
endforeach ()

View File

@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
BLASLONG procs, total_procs, num_cpu_m, num_cpu_n;
BLASLONG procs, num_cpu_m, num_cpu_n;
BLASLONG width, i, j;
BLASLONG divM, divN;

View File

@ -335,7 +335,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

View File

@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
BLASLONG is, min_i, div_n;
BLASLONG i, current;
BLASLONG l1stride, l2size;
BLASLONG l1stride;
#ifdef TIMING
BLASULONG rpcc_counter;
@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
) return 0;
l2size = GEMM_P * GEMM_Q;
#if 0
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
mypos, m_from, m_to, n_from, n_to, N_from, N_to);
@ -369,7 +367,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
START_RPCC();
@ -706,7 +706,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
n = n_to - n_from;
}
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}

View File

@ -33,6 +33,7 @@ set(COMMON_SOURCES
xerbla.c
openblas_set_num_threads.c
openblas_error_handle.c
openblas_env.c
openblas_get_num_procs.c
openblas_get_num_threads.c
)

View File

@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_env.$(SUFFIX) : openblas_env.c
$(CC) $(CFLAGS) -c $< -o $(@F)
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>
@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
extern unsigned int openblas_thread_timeout();
#ifdef SMP_SERVER
#undef MONITOR
@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
int blas_thread_init(void){
BLASLONG i;
int ret;
int thread_timeout_env;
#ifdef NEED_STACKATTR
pthread_attr_t attr;
#endif
@ -540,22 +543,12 @@ int blas_thread_init(void){
if (!blas_server_avail){
env_var_t p;
if (readenv(p,"THREAD_TIMEOUT")) {
thread_timeout = atoi(p);
if (thread_timeout < 4) thread_timeout = 4;
if (thread_timeout > 30) thread_timeout = 30;
thread_timeout = (1 << thread_timeout);
}else{
if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
thread_timeout = atoi(p);
if (thread_timeout < 4) thread_timeout = 4;
if (thread_timeout > 30) thread_timeout = 30;
thread_timeout = (1 << thread_timeout);
}
}
thread_timeout_env=openblas_thread_timeout();
if (thread_timeout_env>0) {
if (thread_timeout_env < 4) thread_timeout_env = 4;
if (thread_timeout_env > 30) thread_timeout_env = 30;
thread_timeout = (1 << thread_timeout_env);
}
for(i = 0; i < blas_num_threads - 1; i++){
@ -576,10 +569,12 @@ int blas_thread_init(void){
struct rlimit rlim;
const char *msg = strerror(ret);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
#ifdef RLIMIT_NPROC
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
"%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
}
#endif
if(0 != raise(SIGINT)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
exit(EXIT_FAILURE);

View File

@ -261,6 +261,11 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Avoton
if (model == 13) {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
return NULL;
case 5:
//Intel Broadwell
@ -318,7 +323,7 @@ static gotoblas_t *get_coretype(void){
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 2){
}else if(model == 2 || model == 3){
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx())
return &gotoblas_PILEDRIVER;
@ -327,7 +332,15 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0){
if (exmodel == 3) {
if (exmodel == 1) {
//AMD Trinity
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if (exmodel == 3) {
//AMD STEAMROLLER
if(support_avx())
return &gotoblas_STEAMROLLER;
@ -378,7 +391,7 @@ static char *corename[] = {
"Nehalem",
"Athlon",
"Opteron",
"Opteron(SSE3)",
"Opteron_SSE3",
"Barcelona",
"Nano",
"Sandybridge",

View File

@ -104,6 +104,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <errno.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
@ -142,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(_MSC_VER) && !defined(__clang__)
#define CONSTRUCTOR __cdecl
#define DESTRUCTOR __cdecl
#elif defined(OS_DARWIN) && defined(C_GCC)
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
@ -167,7 +169,7 @@ void goto_set_num_threads(int num_threads) {};
#else
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_SUNOS)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
@ -292,8 +294,11 @@ void openblas_fork_handler()
#endif
}
extern int openblas_num_threads_env();
extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
env_var_t p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
@ -308,18 +313,18 @@ int blas_get_cpu_number(void){
blas_goto_num = 0;
#ifndef USE_OPENMP
if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
if (blas_goto_num < 0) blas_goto_num = 0;
blas_goto_num=openblas_goto_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
}
#endif
blas_omp_num = 0;
if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
@ -355,7 +360,9 @@ int openblas_get_num_threads(void) {
#ifndef SMP
return 1;
#else
return blas_get_cpu_number();
// init blas_cpu_number if needed
blas_get_cpu_number();
return blas_cpu_number;
#endif
}
@ -914,7 +921,6 @@ static volatile struct {
} memory[NUM_BUFFERS];
static int memory_initialized = 0;
static void gotoblas_memory_init(void);
/* Memory allocation routine */
/* procpos ... indicates where it comes from */
@ -1337,6 +1343,7 @@ static void gotoblas_memory_init(void) {
/* Initialization for all function; this function should be called before main */
static int gotoblas_initialized = 0;
extern void openblas_read_env();
void CONSTRUCTOR gotoblas_init(void) {
@ -1346,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
openblas_fork_handler();
#endif
openblas_read_env();
#ifdef PROFILE
moncontrol (0);
#endif
@ -1362,6 +1371,19 @@ void CONSTRUCTOR gotoblas_init(void) {
gotoblas_memory_init();
#endif
//#if defined(OS_LINUX)
#if 0
struct rlimit curlimit;
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
{
if ( curlimit.rlim_cur != curlimit.rlim_max )
{
curlimit.rlim_cur = curlimit.rlim_max;
setrlimit(RLIMIT_STACK, &curlimit);
}
}
#endif
#ifdef SMP
if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER

View File

@ -0,0 +1,84 @@
/***************************************************************************
Copyright (c) 2011-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static int openblas_env_verbose=0;
static unsigned int openblas_env_thread_timeout=0;
static int openblas_env_block_factor=0;
static int openblas_env_openblas_num_threads=0;
static int openblas_env_goto_num_threads=0;
static int openblas_env_omp_num_threads=0;
int openblas_verbose() { return openblas_env_verbose;}
unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
int openblas_block_factor() { return openblas_env_block_factor;}
int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
void openblas_read_env() {
int ret=0;
env_var_t p;
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_verbose=ret;
ret=0;
if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_block_factor=ret;
ret=0;
if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_thread_timeout=(unsigned int)ret;
ret=0;
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_openblas_num_threads=ret;
ret=0;
if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_goto_num_threads=ret;
ret=0;
if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_omp_num_threads=ret;
}

View File

@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
int openblas_verbose() {
int ret=0;
env_var_t p;
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
if(ret<0) ret=0;
return ret;
}
extern int openblas_verbose();
void openblas_warning(int verbose, const char * msg) {
int current_verbose;

View File

@ -40,6 +40,7 @@
#include <string.h>
#include "common.h"
extern int openblas_block_factor();
int get_L2_size(void);
#define DEFAULT_GEMM_P 128
@ -249,7 +250,6 @@ int get_L2_size(void){
void blas_set_parameter(void){
env_var_t p;
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
int size = 16;
@ -468,9 +468,8 @@ void blas_set_parameter(void){
#endif
#endif
if (readenv(p,"GOTO_BLOCK_FACTOR")) {
factor = atoi(p);
factor=openblas_block_factor();
if (factor>0) {
if (factor < 10) factor = 10;
if (factor > 200) factor = 200;

View File

@ -26,10 +26,16 @@ ifndef ONLY_CBLAS
ONLY_CBLAS = 0
endif
ifndef BUILD_LAPACK_DEPRECATED
BUILD_LAPACK_DEPRECATED = 0
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(F_COMPILER), GFORTRAN)
ifndef ONLY_CBLAS
EXTRALIB += -lgfortran
endif
endif
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
EXTRALIB += -lgomp
@ -39,9 +45,11 @@ endif
ifeq ($(OSNAME), CYGWIN_NT)
ifeq ($(F_COMPILER), GFORTRAN)
ifndef ONLY_CBLAS
EXTRALIB += -lgfortran
endif
endif
endif
all::
@ -88,17 +96,17 @@ dll : ../$(LIBDLLNAME)
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
libopenblas.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed
$(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
@ -110,7 +118,7 @@ endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), Linux)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
so : ../$(LIBSONAME)
@ -201,26 +209,26 @@ static : ../$(LIBNAME)
rm -f goto.$(SUFFIX)
osx.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
objcopy.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
objconv.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
test : linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest
linktest.c : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
clean ::
@rm -f *.def *.dylib __.SYMDEF*
@rm -f *.def *.dylib __.SYMDEF* *.renamed
include ../Makefile.tail

View File

@ -173,18 +173,18 @@
sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv,
sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2,
sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx,
sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf,
sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf,
sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs,
sgehd2, sgehrd, sgelq2, sgelqf,
sgels, sgelsd, sgelss, sgelsy, sgeql2, sgeqlf,
sgeqp3, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs,
sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx,
sgetc2, sgetri,
sggbak, sggbal, sgges, sggesx, sggev, sggevx,
sggglm, sgghrd, sgglse, sggqrf,
sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv,
sggrqf, sgtcon, sgtrfs, sgtsv,
sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz,
shsein, shseqr, slabrd, slacon, slacn2,
slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr,
slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd,
slahr2, slaic1, slaln2, slals0, slalsa, slalsd,
slangb, slange, slangt, slanhs, slansb, slansp,
slansy, slantb, slantp, slantr, slanv2,
slapll, slapmt,
@ -194,7 +194,7 @@
slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv,
slarrv, slartv,
slarz, slarzb, slarzt, slasy2, slasyf,
slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm,
slatbs, slatdf, slatps, slatrd, slatrs, slatrz,
sopgtr, sopmtr, sorg2l, sorg2r,
sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2,
sorgrq, sorgtr, sorm2l, sorm2r,
@ -220,7 +220,7 @@
stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri,
stptrs,
strcon, strevc, strexc, strrfs, strsen, strsna, strsyl,
strtrs, stzrqf, stzrzf, sstemr,
strtrs, stzrzf, sstemr,
slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp,
stfttr, stpttf, stpttr, strttf, strttp,
sgejsv, sgesvj, sgsvj0, sgsvj1,
@ -245,14 +245,13 @@
cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx,
cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd,
cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx,
cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf,
cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3,
cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs,
cgehd2, cgehrd, cgelq2, cgelqf,
cgels, cgelsd, cgelss, cgelsy, cgeql2, cgeqlf, cgeqp3,
cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs,
cgerq2, cgerqf, cgesc2, cgesdd, cgesvd,
cgesvx, cgetc2, cgetri,
cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm,
cgghrd, cgglse, cggqrf, cggrqf,
cggsvd, cggsvp,
cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev,
chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd,
checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst,
@ -267,7 +266,7 @@
claed0, claed7, claed8,
claein, claesy, claev2, clags2, clagtm,
clahef, clahqr,
clahrd, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt,
clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt,
clanhb, clanhe,
clanhp, clanhs, clanht, clansb, clansp, clansy, clantb,
clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge,
@ -278,7 +277,7 @@
clarfx, clargv, clarnv, clarrv, clartg, clartv,
clarz, clarzb, clarzt, clascl, claset, clasr, classq,
clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz,
clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv,
cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv,
cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs,
cposv, cposvx, cpstrf, cpstf2,
cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs,
@ -293,7 +292,7 @@
ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon,
ctprfs, ctptri,
ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna,
ctrsyl, ctrtrs, ctzrqf, ctzrzf, cung2l, cung2r,
ctrsyl, ctrtrs, ctzrzf, cung2l, cung2r,
cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2,
cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2,
cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz,
@ -321,18 +320,18 @@
dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv,
dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2,
dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx,
dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf,
dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf,
dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs,
dgehd2, dgehrd, dgelq2, dgelqf,
dgels, dgelsd, dgelss, dgelsy, dgeql2, dgeqlf,
dgeqp3, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs,
dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx,
dgetc2, dgetri,
dggbak, dggbal, dgges, dggesx, dggev, dggevx,
dggglm, dgghrd, dgglse, dggqrf,
dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv,
dggrqf, dgtcon, dgtrfs, dgtsv,
dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz,
dhsein, dhseqr, dlabrd, dlacon, dlacn2,
dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr,
dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd,
dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd,
dlangb, dlange, dlangt, dlanhs, dlansb, dlansp,
dlansy, dlantb, dlantp, dlantr, dlanv2,
dlapll, dlapmt,
@ -342,7 +341,7 @@
dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx,
dlargv, dlarrv, dlartv,
dlarz, dlarzb, dlarzt, dlasy2, dlasyf,
dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm,
dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz,
dopgtr, dopmtr, dorg2l, dorg2r,
dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2,
dorgrq, dorgtr, dorm2l, dorm2r,
@ -368,7 +367,7 @@
dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri,
dtptrs,
dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl,
dtrtrs, dtzrqf, dtzrzf, dstemr,
dtrtrs, dtzrzf, dstemr,
dsgesv, dsposv, dlag2s, slag2d, dlat2s,
dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp,
dtfttr, dtpttf, dtpttr, dtrttf, dtrttp,
@ -387,14 +386,13 @@
zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx,
zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd,
zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx,
zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf,
zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3,
zgeqpf, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf,
zgehd2, zgehrd, zgelq2, zgelqf,
zgels, zgelsd, zgelss, zgelsy, zgeql2, zgeqlf, zgeqp3,
zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf,
zgesc2, zgesdd, zgesvd, zgesvx, zgetc2,
zgetri,
zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm,
zgghrd, zgglse, zggqrf, zggrqf,
zggsvd, zggsvp,
zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev,
zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd,
zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst,
@ -409,7 +407,7 @@
zlaed0, zlaed7, zlaed8,
zlaein, zlaesy, zlaev2, zlags2, zlagtm,
zlahef, zlahqr,
zlahrd, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange,
zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange,
zlangt, zlanhb,
zlanhe,
zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb,
@ -422,7 +420,7 @@
zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv,
zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr,
zlassq, zlasyf,
zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm,
zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz,
zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv,
zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs,
zposv, zposvx, zpotrs, zpstrf, zpstf2,
@ -438,7 +436,7 @@
ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon,
ztprfs, ztptri,
ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna,
ztrsyl, ztrtrs, ztzrqf, ztzrzf, zung2l,
ztrsyl, ztrtrs, ztzrzf, zung2l,
zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2,
zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2,
zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz,
@ -452,6 +450,139 @@
zunbdb5, zunbdb6, zuncsd, zuncsd2by1,
zgeqrt, zgeqrt2, zgeqrt3, zgemqrt,
ztpqrt, ztpqrt2, ztpmqrt, ztprfb,
# functions added for lapack-3.6.0
cgejsv,
cgesvdx,
cgesvj,
cgetrf2,
cgges3,
cggev3,
cgghd3,
cggsvd3,
cggsvp3,
cgsvj0,
cgsvj1,
clagge,
claghe,
clagsy,
clahilb,
clakf2,
clarge,
clarnd,
claror,
clarot,
clatm1,
clatm2,
clatm3,
clatm5,
clatm6,
clatme,
clatmr,
clatms,
clatmt,
cpotrf2,
csbmv,
cspr2,
csyr2,
cunm22,
dbdsvdx,
dgesvdx,
dgetrf2,
dgges3,
dggev3,
dgghd3,
dggsvd3,
dggsvp3,
dladiv2,
dlagge,
dlagsy,
dlahilb,
dlakf2,
dlaran,
dlarge,
dlarnd,
dlaror,
dlarot,
dlatm1,
dlatm2,
dlatm3,
dlatm5,
dlatm6,
dlatm7,
dlatme,
dlatmr,
dlatms,
dlatmt,
dorm22,
dpotrf2,
dsecnd,
sbdsvdx,
second,
sgesvdx,
sgetrf2,
sgges3,
sggev3,
sgghd3,
sggsvd3,
sggsvp3,
sladiv2,
slagge,
slagsy,
slahilb,
slakf2,
slaran,
slarge,
slarnd,
slaror,
slarot,
slatm1,
slatm2,
slatm3,
slatm5,
slatm6,
slatm7,
slatme,
slatmr,
slatms,
slatmt,
sorm22,
spotrf2,
zgejsv,
zgesvdx,
zgesvj,
zgetrf2,
zgges3,
zggev3,
zgghd3,
zggsvd3,
zggsvp3,
zgsvj0,
zgsvj1,
zlagge,
zlaghe,
zlagsy,
zlahilb,
zlakf2,
zlarge,
zlarnd,
zlaror,
zlarot,
zlatm1,
zlatm2,
zlatm3,
zlatm5,
zlatm6,
zlatme,
zlatmr,
zlatms,
zlatmt,
zpotrf2,
zsbmv,
zspr2,
zsyr2,
zunm22
);
@lapack_extendedprecision_objs = (
@ -459,6 +590,13 @@
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
);
@lapack_deprecated_objs = (
cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp,
cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd,
cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm,
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
);
@lapackeobjs = (
# LAPACK C interface routines.
#
@ -682,8 +820,6 @@
LAPACKE_cgeqlf_work,
LAPACKE_cgeqp3,
LAPACKE_cgeqp3_work,
LAPACKE_cgeqpf,
LAPACKE_cgeqpf_work,
LAPACKE_cgeqr2,
LAPACKE_cgeqr2_work,
LAPACKE_cgeqrf,
@ -738,10 +874,6 @@
LAPACKE_cggqrf_work,
LAPACKE_cggrqf,
LAPACKE_cggrqf_work,
LAPACKE_cggsvd,
LAPACKE_cggsvd_work,
LAPACKE_cggsvp,
LAPACKE_cggsvp_work,
LAPACKE_cgtcon,
LAPACKE_cgtcon_work,
LAPACKE_cgtrfs,
@ -1186,8 +1318,6 @@
LAPACKE_dgeqlf_work,
LAPACKE_dgeqp3,
LAPACKE_dgeqp3_work,
LAPACKE_dgeqpf,
LAPACKE_dgeqpf_work,
LAPACKE_dgeqr2,
LAPACKE_dgeqr2_work,
LAPACKE_dgeqrf,
@ -1244,10 +1374,6 @@
LAPACKE_dggqrf_work,
LAPACKE_dggrqf,
LAPACKE_dggrqf_work,
LAPACKE_dggsvd,
LAPACKE_dggsvd_work,
LAPACKE_dggsvp,
LAPACKE_dggsvp_work,
LAPACKE_dgtcon,
LAPACKE_dgtcon_work,
LAPACKE_dgtrfs,
@ -1676,8 +1802,6 @@
LAPACKE_sgeqlf_work,
LAPACKE_sgeqp3,
LAPACKE_sgeqp3_work,
LAPACKE_sgeqpf,
LAPACKE_sgeqpf_work,
LAPACKE_sgeqr2,
LAPACKE_sgeqr2_work,
LAPACKE_sgeqrf,
@ -1734,10 +1858,6 @@
LAPACKE_sggqrf_work,
LAPACKE_sggrqf,
LAPACKE_sggrqf_work,
LAPACKE_sggsvd,
LAPACKE_sggsvd_work,
LAPACKE_sggsvp,
LAPACKE_sggsvp_work,
LAPACKE_sgtcon,
LAPACKE_sgtcon_work,
LAPACKE_sgtrfs,
@ -2158,8 +2278,6 @@
LAPACKE_zgeqlf_work,
LAPACKE_zgeqp3,
LAPACKE_zgeqp3_work,
LAPACKE_zgeqpf,
LAPACKE_zgeqpf_work,
LAPACKE_zgeqr2,
LAPACKE_zgeqr2_work,
LAPACKE_zgeqrf,
@ -2214,10 +2332,6 @@
LAPACKE_zggqrf_work,
LAPACKE_zggrqf,
LAPACKE_zggrqf_work,
LAPACKE_zggsvd,
LAPACKE_zggsvd_work,
LAPACKE_zggsvp,
LAPACKE_zggsvp_work,
LAPACKE_zgtcon,
LAPACKE_zgtcon_work,
LAPACKE_zgtrfs,
@ -2707,6 +2821,134 @@
LAPACKE_slagsy_work,
LAPACKE_zlagsy,
LAPACKE_zlagsy_work,
## new function from lapack-3.6.0
LAPACKE_cgejsv,
LAPACKE_cgejsv_work,
LAPACKE_cgesvdx,
LAPACKE_cgesvdx_work,
LAPACKE_cgesvj,
LAPACKE_cgesvj_work,
LAPACKE_cgetrf2,
LAPACKE_cgetrf2_work,
LAPACKE_cgges3,
LAPACKE_cgges3_work,
LAPACKE_cggev3,
LAPACKE_cggev3_work,
LAPACKE_cgghd3,
LAPACKE_cgghd3_work,
LAPACKE_cggsvd3,
LAPACKE_cggsvd3_work,
LAPACKE_cggsvp3,
LAPACKE_cggsvp3_work,
LAPACKE_chetrf_rook,
LAPACKE_chetrf_rook_work,
LAPACKE_chetrs_rook,
LAPACKE_chetrs_rook_work,
LAPACKE_clapmt,
LAPACKE_clapmt_work,
LAPACKE_clascl,
LAPACKE_clascl_work,
LAPACKE_cpotrf2,
LAPACKE_cpotrf2_work,
LAPACKE_csytrf_rook,
LAPACKE_csytrf_rook_work,
LAPACKE_csytrs_rook,
LAPACKE_csytrs_rook_work,
LAPACKE_cuncsd2by1,
LAPACKE_cuncsd2by1_work,
LAPACKE_dbdsvdx,
LAPACKE_dbdsvdx_work,
LAPACKE_dgesvdx,
LAPACKE_dgesvdx_work,
LAPACKE_dgetrf2,
LAPACKE_dgetrf2_work,
LAPACKE_dgges3,
LAPACKE_dgges3_work,
LAPACKE_dggev3,
LAPACKE_dggev3_work,
LAPACKE_dgghd3,
LAPACKE_dgghd3_work,
LAPACKE_dggsvd3,
LAPACKE_dggsvd3_work,
LAPACKE_dggsvp3,
LAPACKE_dggsvp3_work,
LAPACKE_dlapmt,
LAPACKE_dlapmt_work,
LAPACKE_dlascl,
LAPACKE_dlascl_work,
LAPACKE_dorcsd2by1,
LAPACKE_dorcsd2by1_work,
LAPACKE_dpotrf2,
LAPACKE_dpotrf2_work,
LAPACKE_dsytrf_rook,
LAPACKE_dsytrf_rook_work,
LAPACKE_dsytrs_rook,
LAPACKE_dsytrs_rook_work,
LAPACKE_sbdsvdx,
LAPACKE_sbdsvdx_work,
LAPACKE_sgesvdx,
LAPACKE_sgesvdx_work,
LAPACKE_sgetrf2,
LAPACKE_sgetrf2_work,
LAPACKE_sgges3,
LAPACKE_sgges3_work,
LAPACKE_sggev3,
LAPACKE_sggev3_work,
LAPACKE_sgghd3,
LAPACKE_sgghd3_work,
LAPACKE_sggsvd3,
LAPACKE_sggsvd3_work,
LAPACKE_sggsvp3,
LAPACKE_sggsvp3_work,
LAPACKE_slapmt,
LAPACKE_slapmt_work,
LAPACKE_slascl,
LAPACKE_slascl_work,
LAPACKE_sorcsd2by1,
LAPACKE_sorcsd2by1_work,
LAPACKE_spotrf2,
LAPACKE_spotrf2_work,
LAPACKE_ssytrf_rook,
LAPACKE_ssytrf_rook_work,
LAPACKE_ssytrs_rook,
LAPACKE_ssytrs_rook_work,
LAPACKE_stpqrt,
LAPACKE_stpqrt_work,
LAPACKE_zgejsv,
LAPACKE_zgejsv_work,
LAPACKE_zgesvdx,
LAPACKE_zgesvdx_work,
LAPACKE_zgesvj,
LAPACKE_zgesvj_work,
LAPACKE_zgetrf2,
LAPACKE_zgetrf2_work,
LAPACKE_zgges3,
LAPACKE_zgges3_work,
LAPACKE_zggev3,
LAPACKE_zggev3_work,
LAPACKE_zgghd3,
LAPACKE_zgghd3_work,
LAPACKE_zggsvd3,
LAPACKE_zggsvd3_work,
LAPACKE_zggsvp3,
LAPACKE_zggsvp3_work,
LAPACKE_zhetrf_rook,
LAPACKE_zhetrf_rook_work,
LAPACKE_zhetrs_rook,
LAPACKE_zhetrs_rook_work,
LAPACKE_zlapmt,
LAPACKE_zlapmt_work,
LAPACKE_zlascl,
LAPACKE_zlascl_work,
LAPACKE_zpotrf2,
LAPACKE_zpotrf2_work,
LAPACKE_zsytrf_rook,
LAPACKE_zsytrf_rook_work,
LAPACKE_zsytrs_rook,
LAPACKE_zsytrs_rook_work,
LAPACKE_zuncsd2by1,
LAPACKE_zuncsd2by1_work
);
#These function may need 2 underscores.
@ -2749,6 +2991,11 @@ if ($ARGV[8] == 1) {
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
};
if ($ARGV[11] == 1){
#BUILD_LAPACK_DEPRECATED=1
@underscore_objs =(@underscore_objs, @lapack_deprecated_objs);
}
} else {
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
}

View File

@ -1,5 +1,7 @@
#!/usr/bin/perl
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
#
# 1. Not specified
# 1.1 Automatically detect, then check compiler
@ -272,8 +274,9 @@ if ($link ne "") {
}
if ($flags =~ /^\-Y/) {
next if ($hostos eq 'SunOS');
$linker_L .= "-Wl,". $flags . " ";
}
}
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;

View File

@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#ifdef linux
#if defined(linux) || defined(__sun__)
#include <sys/sysinfo.h>
#include <unistd.h>
#endif
@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER5"
#endif
#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER6"
@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER6"
#endif
#if defined(FORCE_POWER8)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER8"
#define SUBDIRNAME "power"
#define ARCHCONFIG "-DPOWER8 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "power8"
#define CORENAME "POWER8"
#endif
#ifdef FORCE_PPCG4
#define FORCE
#define ARCHITECTURE "POWER"
@ -819,10 +833,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
#define LIBNAME "armv8"
#define CORENAME "XGENE1"
#else
#define CORENAME "ARMV8"
#endif
#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA57 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif
#ifndef FORCE
@ -892,7 +920,7 @@ static int get_num_cores(void) {
size_t len;
#endif
#ifdef linux
#if defined(linux) || defined(__sun__)
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
@ -984,7 +1012,9 @@ int main(int argc, char *argv[]){
#endif
#endif
#if NO_PARALLEL_MAKE==1
#ifdef MAKE_NB_JOBS
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
#elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS

View File

@ -79,11 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N,
FLOAT alpha = *ALPHA;
FLOAT beta = *BETA;
FLOAT *buffer;
int buffer_size;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
@ -134,13 +132,10 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint lenx, leny;
int trans;
int trans, buffer_size;
blasint info, t;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
@ -215,43 +210,20 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0) x -= (lenx - 1) * incx;
if (incy < 0) y -= (leny - 1) * incy;
#ifdef MAX_STACK_ALLOC
// make it volatile because some gemv implementation (ex: dgemv_n.S)
// do not restore all register
volatile int stack_alloc_size = 0;
//for gemv_n and gemv_t, try to allocate on stack
stack_alloc_size = m + n;
#ifdef ALIGNED_ACCESS
stack_alloc_size += 3;
#endif
if(stack_alloc_size < 128)
//dgemv_n.S require a 128 bytes buffer
stack_alloc_size = 128;
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
stack_alloc_size = 0;
FLOAT stack_buffer[stack_alloc_size];
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
// printf("stack_alloc_size=%d\n", stack_alloc_size);
#else
//Original OpenBLAS/GotoBLAS codes.
buffer = (FLOAT *)blas_memory_alloc(1);
buffer_size = m + n + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT) ;
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
#ifdef SMP
nthreads_max = num_cpu_avail(2);
nthreads_avail = nthreads_max;
MNK = (double) m * (double) n;
if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
nthreads_max = 1;
if ( nthreads_max > nthreads_avail )
nthreads = nthreads_avail;
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = nthreads_max;
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
@ -266,14 +238,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size){
blas_memory_free(buffer);
}
#else
blas_memory_free(buffer);
#endif
STACK_FREE(buffer);
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
IDEBUG_END;

View File

@ -171,19 +171,14 @@ void CNAME(enum CBLAS_ORDER order,
if (incy < 0) y -= (n - 1) * incy;
if (incx < 0) x -= (m - 1) * incx;
#ifdef MAX_STACK_ALLOC
volatile int stack_alloc_size = m;
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
stack_alloc_size = 0;
FLOAT stack_buffer[stack_alloc_size];
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
#else
buffer = (FLOAT *)blas_memory_alloc(1);
#endif
STACK_ALLOC(m, FLOAT, buffer);
#ifdef SMPTEST
nthreads = num_cpu_avail(2);
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD)
nthreads = num_cpu_avail(2);
else
nthreads = 1;
if (nthreads == 1) {
#endif
@ -198,11 +193,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size)
#endif
blas_memory_free(buffer);
STACK_FREE(buffer);
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
IDEBUG_END;

View File

@ -95,7 +95,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
s = db / r;
z = ONE;
if (ada > adb) z = s;
if ((ada < adb) && (c != ZERO)) z = ONE / c;
if ((ada <= adb) && (c != ZERO)) z = ONE / c;
*C = c;
*S = s;

View File

@ -77,12 +77,13 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) {
#endif

View File

@ -91,6 +91,27 @@
#endif
#endif
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
#define MODE (BLAS_XDOUBLE | BLAS_REAL)
#elif defined(DOUBLE)
#define MODE (BLAS_DOUBLE | BLAS_REAL)
#else
#define MODE (BLAS_SINGLE | BLAS_REAL)
#endif
#else
#ifdef XDOUBLE
#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX)
#elif defined(DOUBLE)
#define MODE (BLAS_DOUBLE | BLAS_COMPLEX)
#else
#define MODE (BLAS_SINGLE | BLAS_COMPLEX)
#endif
#endif
#endif
static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
#ifndef HEMM
@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO,
FLOAT *buffer;
FLOAT *sa, *sb;
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif
#if defined(SMP) && !defined(NO_AFFINITY)
int nodes;
#endif
@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
FLOAT *buffer;
FLOAT *sa, *sb;
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif
#if defined(SMP) && !defined(NO_AFFINITY)
int nodes;
#endif
@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
args.nthreads /= nodes;
gemm_thread_mn(mode, &args, NULL, NULL,
gemm_thread_mn(MODE, &args, NULL, NULL,
symm[4 | (side << 1) | uplo ], sa, sb, nodes);
} else {
@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
#else
GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
#endif

View File

@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
#ifdef SMP
int nthreads;
@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

View File

@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
#ifdef SMP
int nthreads;
@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

View File

@ -77,11 +77,9 @@ void NAME(char *TRANS, blasint *M, blasint *N,
blasint incy = *INCY;
FLOAT *buffer;
int buffer_size;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
@ -144,13 +142,10 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint lenx, leny;
int trans;
int trans, buffer_size;
blasint info, t;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
@ -236,22 +231,26 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0) x -= (lenx - 1) * incx * 2;
if (incy < 0) y -= (leny - 1) * incy * 2;
buffer = (FLOAT *)blas_memory_alloc(1);
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT) ;
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
// cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746)
if(trans && stack_alloc_size)
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
#endif
#ifdef SMP
nthreads_max = num_cpu_avail(2);
nthreads_avail = nthreads_max;
MNK = (double) m * (double) n;
if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) ))
nthreads_max = 1;
if ( nthreads_max > nthreads_avail )
nthreads = nthreads_avail;
if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = nthreads_max;
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
@ -267,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
blas_memory_free(buffer);
STACK_FREE(buffer);
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);

View File

@ -210,10 +210,14 @@ void CNAME(enum CBLAS_ORDER order,
if (incy < 0) y -= (n - 1) * incy * 2;
if (incx < 0) x -= (m - 1) * incx * 2;
buffer = (FLOAT *)blas_memory_alloc(1);
STACK_ALLOC(2 * m, FLOAT, buffer);
#ifdef SMPTEST
nthreads = num_cpu_avail(2);
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
nthreads = num_cpu_avail(2);
else
nthreads = 1;
if (nthreads == 1) {
#endif
@ -245,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
blas_memory_free(buffer);
STACK_FREE(buffer);
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);

View File

@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
FLOAT beta_i = BETA[1];
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
#ifdef SMP
int nthreads;
@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

View File

@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
#ifdef SMP
int nthreads;
@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

View File

@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
FLOAT alpha_r = ALPHA[0];
FLOAT alpha_i = ALPHA[1];
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
#ifdef SMP
int nthreads;
@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

Some files were not shown because too many files have changed in this diff Show More