Compare commits
267 Commits
optimized_
...
z13
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
864e202afd | ||
|
|
9e4b6971e2 | ||
|
|
7f2a959e3e | ||
|
|
6418667818 | ||
|
|
dd43661cfd | ||
|
|
9253dadaa7 | ||
|
|
1e03a62b67 | ||
|
|
faa73690e4 | ||
|
|
f24d5307cf | ||
|
|
0a4276bc2f | ||
|
|
08bddde3f3 | ||
|
|
e173c51c04 | ||
|
|
9c42f0374a | ||
|
|
d4380c1fe4 | ||
|
|
a51102e9b7 | ||
|
|
7282419525 | ||
|
|
c5b1fbcb2e | ||
|
|
e1cdd15b30 | ||
|
|
d4c0330967 | ||
|
|
12540cedb5 | ||
|
|
99adc8b062 | ||
|
|
6a9bbfc227 | ||
|
|
3349e9debd | ||
|
|
dd7612358d | ||
|
|
e5a6ef3808 | ||
|
|
7aac0aff8e | ||
|
|
26d7f06206 | ||
|
|
68a69c5b50 | ||
|
|
a571359afd | ||
|
|
c2464a7c4a | ||
|
|
294f933869 | ||
|
|
f59c9bd6ef | ||
|
|
c53be46d78 | ||
|
|
bbb2d73d73 | ||
|
|
659ed16591 | ||
|
|
35c98a3556 | ||
|
|
f1a5dd06c5 | ||
|
|
e125a3dc33 | ||
|
|
35f1f21a7f | ||
|
|
7b4b7179ba | ||
|
|
7a92c1538e | ||
|
|
5727268141 | ||
|
|
3d9a50e841 | ||
|
|
828c849b44 | ||
|
|
ecc0bc9813 | ||
|
|
12f209b7b0 | ||
|
|
7316a87930 | ||
|
|
0bff057a87 | ||
|
|
7ee1d29dd4 | ||
|
|
1e6cf9808c | ||
|
|
278511ad2d | ||
|
|
3b5ffb49d3 | ||
|
|
8519e4ed9f | ||
|
|
55eda3813b | ||
|
|
53bfc83c26 | ||
|
|
13ca89f6f0 | ||
|
|
461cf9ea38 | ||
|
|
0664ba4c97 | ||
|
|
aa744dfa59 | ||
|
|
61cf8f74d9 | ||
|
|
de202fa375 | ||
|
|
6f93b53590 | ||
|
|
11c44dede1 | ||
|
|
f00d642592 | ||
|
|
9e4584d069 | ||
|
|
2a5679da5f | ||
|
|
a71e8c82f6 | ||
|
|
9b987badb0 | ||
|
|
1619b2f3c8 | ||
|
|
4f3153395a | ||
|
|
d7a1a7ff2a | ||
|
|
308e6195b7 | ||
|
|
7a3d7b1f52 | ||
|
|
74cc2d6623 | ||
|
|
fc3a558515 | ||
|
|
cd9fafc054 | ||
|
|
84b92e6373 | ||
|
|
c279a53ed8 | ||
|
|
e1df5a6e23 | ||
|
|
5c658f8746 | ||
|
|
ec4390a967 | ||
|
|
fced5744fb | ||
|
|
8c0fb1258d | ||
|
|
aae581d004 | ||
|
|
e17303933a | ||
|
|
f9226275f4 | ||
|
|
cf8c7e28b3 | ||
|
|
5ac02f6dc7 | ||
|
|
7aa1ad4923 | ||
|
|
dcd15b546c | ||
|
|
96284ab295 | ||
|
|
d5e1255ca7 | ||
|
|
587455868e | ||
|
|
323c237e7b | ||
|
|
faa5e2e5e3 | ||
|
|
551fdf53e8 | ||
|
|
fdf291be30 | ||
|
|
68eb4fa329 | ||
|
|
05196a8497 | ||
|
|
db9b611b12 | ||
|
|
2e6333f74e | ||
|
|
c99cc41cbd | ||
|
|
711ecb8bd5 | ||
|
|
10c2ebdfc5 | ||
|
|
26b3b3a3e6 | ||
|
|
acdff55a6a | ||
|
|
7d6b68eb4a | ||
|
|
0bbca5e803 | ||
|
|
cd5241d0cf | ||
|
|
8d652f11e7 | ||
|
|
6c86570e1f | ||
|
|
53ba1a77c8 | ||
|
|
d23c7c713c | ||
|
|
8c43d7fa5f | ||
|
|
085f215257 | ||
|
|
8f758eeff9 | ||
|
|
0afc76fd65 | ||
|
|
91e1c5080c | ||
|
|
73f04c2c72 | ||
|
|
3e633152c6 | ||
|
|
d5130ce7e3 | ||
|
|
4824b88fcb | ||
|
|
cc26d888b8 | ||
|
|
8577be2a95 | ||
|
|
1edf30b790 | ||
|
|
b752858d6c | ||
|
|
4fc8c937d4 | ||
|
|
efa4f5c936 | ||
|
|
17d655fa64 | ||
|
|
f68141cf1d | ||
|
|
aa90518201 | ||
|
|
6b85dbb6dc | ||
|
|
a0debd4293 | ||
|
|
937493bfeb | ||
|
|
74b0672223 | ||
|
|
6e7be06e07 | ||
|
|
a04d0555ba | ||
|
|
3761c30ba4 | ||
|
|
38593cd3a3 | ||
|
|
e3b7781c2b | ||
|
|
5e6965ea47 | ||
|
|
5cc0301fc3 | ||
|
|
19a6dedfd6 | ||
|
|
0e2b92e216 | ||
|
|
d06b92906a | ||
|
|
8e98478ff3 | ||
|
|
fb8968fb83 | ||
|
|
dae6b82a71 | ||
|
|
d73244b825 | ||
|
|
233c6b959f | ||
|
|
16ec5323c9 | ||
|
|
0ad02ef2d6 | ||
|
|
73397faf68 | ||
|
|
5fc2203d8a | ||
|
|
78dcf5c3d5 | ||
|
|
32f793195f | ||
|
|
be4e5fcd20 | ||
|
|
855e0cb700 | ||
|
|
7f7d04dcd2 | ||
|
|
4e1b521e27 | ||
|
|
a1a96589aa | ||
|
|
0e68beb89f | ||
|
|
926ba8b7ca | ||
|
|
9f080c47e1 | ||
|
|
52eba814ce | ||
|
|
935356c34f | ||
|
|
ff9388d625 | ||
|
|
4f05c23673 | ||
|
|
4a1263f609 | ||
|
|
962376664d | ||
|
|
5fef0d1b75 | ||
|
|
578f471808 | ||
|
|
5a8447e97e | ||
|
|
be95bdaf47 | ||
|
|
c44ff4d648 | ||
|
|
e003a1294c | ||
|
|
44062517eb | ||
|
|
13f0f8c10e | ||
|
|
f5df444ceb | ||
|
|
e382713423 | ||
|
|
aaa8551c57 | ||
|
|
0d87c1ffb6 | ||
|
|
0b194426f8 | ||
|
|
63a7d7fb24 | ||
|
|
b4ede558a5 | ||
|
|
de3e2d4349 | ||
|
|
a0e51e96f1 | ||
|
|
d6afac9624 | ||
|
|
c2891330bc | ||
|
|
ceaa931e48 | ||
|
|
eaa63165df | ||
|
|
c65357c566 | ||
|
|
e63e9f9f26 | ||
|
|
1fe3aab047 | ||
|
|
aafd3ab60e | ||
|
|
1a1935507b | ||
|
|
d2f84c9c8a | ||
|
|
ca32253f32 | ||
|
|
9066d1f982 | ||
|
|
8d85be770d | ||
|
|
7ba1d9b9ca | ||
|
|
31aff441ce | ||
|
|
e737e32fd1 | ||
|
|
8635d425c1 | ||
|
|
97cd4b8aee | ||
|
|
72390e3ffb | ||
|
|
b07d733a71 | ||
|
|
fa3018c30e | ||
|
|
6caa40302e | ||
|
|
a48b247e9e | ||
|
|
b1b115ecd6 | ||
|
|
07bba933ff | ||
|
|
e85f8af519 | ||
|
|
adfa0ab878 | ||
|
|
cbb6649e97 | ||
|
|
77abc9b280 | ||
|
|
81e8690763 | ||
|
|
dd04a8ac22 | ||
|
|
cb554b3a9c | ||
|
|
1153459d1b | ||
|
|
1a73390ffe | ||
|
|
8b981e41a1 | ||
|
|
c10b1f555d | ||
|
|
14db1ca508 | ||
|
|
66eafb16cf | ||
|
|
3ae30cd6b9 | ||
|
|
692d9c881c | ||
|
|
055b481386 | ||
|
|
ce2b1edd4e | ||
|
|
8cf3657fb6 | ||
|
|
44222a7fe0 | ||
|
|
3ac153180c | ||
|
|
96b486acee | ||
|
|
3602a2cd1f | ||
|
|
b65de4947a | ||
|
|
04ad946fc8 | ||
|
|
f704b8d32f | ||
|
|
708ad330ac | ||
|
|
c6a27bbe64 | ||
|
|
f16b4f10b6 | ||
|
|
87a2ccc37c | ||
|
|
e3e20e2242 | ||
|
|
594b9f4c73 | ||
|
|
c96c6a26fd | ||
|
|
c8f2c5d636 | ||
|
|
5f2fa15e04 | ||
|
|
7d144aaabc | ||
|
|
f9890a6452 | ||
|
|
2c7143459f | ||
|
|
3857581adf | ||
|
|
e9754e6250 | ||
|
|
76398c3233 | ||
|
|
ba024fcfc0 | ||
|
|
b9b52c295d | ||
|
|
285d042b10 | ||
|
|
01db7908b8 | ||
|
|
5f75df40d5 | ||
|
|
b3f100dc25 | ||
|
|
2f65aad626 | ||
|
|
25116788ef | ||
|
|
958f0de65e | ||
|
|
5d212f66a7 | ||
|
|
f88ee18409 | ||
|
|
d22917a58a | ||
|
|
640cccc2b1 | ||
|
|
fba6532502 | ||
|
|
3e8d6ea74f |
@@ -24,7 +24,12 @@ before_install:
|
||||
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
|
||||
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
|
||||
|
||||
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||
script:
|
||||
- set -e
|
||||
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
|
||||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 16.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 19.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
@@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH)
|
||||
list(APPEND BLASDIRS kernel)
|
||||
endif ()
|
||||
|
||||
if (DEFINED UTEST_CHECK)
|
||||
set(SANITY_CHECK 1)
|
||||
endif ()
|
||||
|
||||
if (DEFINED SANITY_CHECK)
|
||||
list(APPEND BLASDIRS reference)
|
||||
endif ()
|
||||
@@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED})
|
||||
message(FATAL_ERROR "Neither static nor shared are enabled.")
|
||||
endif ()
|
||||
|
||||
#Set default output directory
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
|
||||
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
@@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
|
||||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(utest)
|
||||
|
||||
if(NOT MSVC)
|
||||
#only build shared library for MSVC
|
||||
@@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
|
||||
endif()
|
||||
|
||||
#build test and ctest
|
||||
enable_testing()
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
|
||||
@@ -121,6 +121,17 @@ In chronological order:
|
||||
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
|
||||
ARMv8 support.
|
||||
|
||||
* Jerome Robert <jeromerobert@gmx.com>
|
||||
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
|
||||
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
|
||||
* [2015-12-28] Allow to force the number of parallel make jobs
|
||||
* [2015-12-28] Fix detection of AMD E2-3200
|
||||
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
|
||||
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
|
||||
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
|
||||
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
|
||||
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
|
||||
|
||||
* Dan Kortschak
|
||||
* [2015-01-07] Added test for drotmg bug #484.
|
||||
|
||||
@@ -130,5 +141,16 @@ In chronological order:
|
||||
* Martin Koehler <https://github.com/grisuthedragon/>
|
||||
* [2015-09-07] Improved imatcopy
|
||||
|
||||
* [Your name or handle] <[email or website]>
|
||||
* [Date] [Brief summary of your changes]
|
||||
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
|
||||
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
|
||||
* [2015-11-20] lapack-test fixes for Cortex-A57
|
||||
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
|
||||
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
|
||||
|
||||
* theoractice <https://github.com/theoractice/>
|
||||
* [2016-03-20] Fix compiler error in VisualStudio with CMake
|
||||
* [2016-03-22] Fix access violation on Windows while static linking
|
||||
|
||||
* Abdelrauf <https://github.com/quickwritereader>
|
||||
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
|
||||
|
||||
|
||||
@@ -1,4 +1,81 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.18
|
||||
12-Apr-2016
|
||||
common:
|
||||
* If you set the MAKE_NB_JOBS flag to a value less than or equal to zero,
|
||||
make will be without -j.
|
||||
|
||||
x86/x86_64:
|
||||
* Support building Visual Studio static library. (#813, Thanks, theoractice)
|
||||
* Fix bugs to pass buildbot CI tests (http://build.openblas.net)
|
||||
|
||||
ARM:
|
||||
* Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize S and C BLAS3 on Power8
|
||||
* Optimize BLAS2/1 on Power8
|
||||
|
||||
====================================================================
|
||||
Version 0.2.17
|
||||
20-Mar-2016
|
||||
common:
|
||||
* Enable BUILD_LAPACK_DEPRECATED=1 by default.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16
|
||||
15-Mar-2016
|
||||
common:
|
||||
* Avoid potential getenv segfault. (#716)
|
||||
* Import LAPACK svn bugfix #142-#147,#150-#155
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
|
||||
* Fix bug with scipy linalg test.
|
||||
|
||||
ARM:
|
||||
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize D and Z BLAS3 functions for Power8.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16.rc1
|
||||
23-Feb-2016
|
||||
common:
|
||||
* Upgrade LAPACK to 3.6.0 version.
|
||||
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
|
||||
LAPACK deprecated functions.
|
||||
* Add MAKE_NB_JOBS option in Makefile.
|
||||
Force number of make jobs. This is particularly
|
||||
useful when using distcc. (#735. Thanks, Jerome Robert.)
|
||||
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
|
||||
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
|
||||
* Improve small zger, zgemv, ztrmv using stack allocation (#727. Thanks, Jerome Robert)
|
||||
* Let openblas_get_num_threads return the number of active threads.
|
||||
(#760. Thanks, Jerome Robert)
|
||||
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
|
||||
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
|
||||
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
|
||||
* Detect Intel Avoton.
|
||||
* Detect AMD Trinity, Richland, E2-3200.
|
||||
* Fix gemv performance bug on Mac OSX Intel Haswell.
|
||||
* Fix some bugs with CMake and Visual Studio
|
||||
|
||||
ARM:
|
||||
* Support and optimize Cortex-A57 AArch64.
|
||||
(#686. Thanks, Ashwin Sekhar TK)
|
||||
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
|
||||
* Update ARMV6 kernels.
|
||||
|
||||
POWER:
|
||||
* Fix detection of POWER architecture
|
||||
(#684. Thanks, Sebastien Villemot)
|
||||
|
||||
====================================================================
|
||||
Version 0.2.15
|
||||
27-Oct-2015
|
||||
|
||||
21
Makefile
21
Makefile
@@ -7,10 +7,6 @@ ifneq ($(DYNAMIC_ARCH), 1)
|
||||
BLASDIRS += kernel
|
||||
endif
|
||||
|
||||
ifdef UTEST_CHECK
|
||||
SANITY_CHECK = 1
|
||||
endif
|
||||
|
||||
ifdef SANITY_CHECK
|
||||
BLASDIRS += reference
|
||||
endif
|
||||
@@ -85,22 +81,22 @@ endif
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
@@ -117,10 +113,8 @@ ifndef CROSS
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
ifdef UTEST_CHECK
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
endif
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
endif
|
||||
@@ -263,6 +257,9 @@ endif
|
||||
else
|
||||
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
|
||||
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
||||
@@ -11,8 +11,8 @@ endif
|
||||
|
||||
ifeq ($(CORE), ARMV7)
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
|
||||
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
|
||||
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
|
||||
else
|
||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
@@ -29,5 +29,3 @@ ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -marm -march=armv5
|
||||
FCOMMON_OPT += -marm -march=armv5
|
||||
endif
|
||||
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ install : lib.grd
|
||||
#for inc
|
||||
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@@ -48,10 +48,10 @@ endif
|
||||
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
@@ -64,7 +64,7 @@ endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.16.dev
|
||||
VERSION = 0.2.19.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -79,8 +79,11 @@ VERSION = 0.2.16.dev
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
@@ -108,6 +111,13 @@ NO_AFFINITY = 1
|
||||
# Don't use parallel make.
|
||||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
# Force number of make jobs. The default is the number of logical CPU of the host.
|
||||
# This is particularly useful when using distcc.
|
||||
# A negative value will disable adding a -j flag to make, allowing to use a parent
|
||||
# make -j value. This is useful to call OpenBLAS make from another project
|
||||
# makefile
|
||||
# MAKE_NB_JOBS = 2
|
||||
|
||||
# If you would like to know minute performance report of GotoBLAS.
|
||||
# FUNCTION_PROFILE = 1
|
||||
|
||||
@@ -138,10 +148,6 @@ NO_AFFINITY = 1
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
|
||||
# SANITY_CHECK to compare the result with reference BLAS.
|
||||
# UTEST_CHECK = 1
|
||||
|
||||
# The installation directory.
|
||||
# PREFIX = /opt/OpenBLAS
|
||||
|
||||
@@ -159,10 +165,11 @@ COMMON_PROF = -pg
|
||||
# Build Debug version
|
||||
# DEBUG = 1
|
||||
|
||||
# Improve GEMV and GER for small matrices by stack allocation.
|
||||
# For details, https://github.com/xianyi/OpenBLAS/pull/482
|
||||
# Set maximum stack allocation.
|
||||
# The default value is 2048. 0 disables stack allocation and may reduce GER and GEMV
|
||||
# performance. For details, https://github.com/xianyi/OpenBLAS/pull/482
|
||||
#
|
||||
MAX_STACK_ALLOC=2048
|
||||
# MAX_STACK_ALLOC = 0
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoid conflicts with other BLAS libraries, especially when using
|
||||
|
||||
@@ -139,6 +139,10 @@ NO_PARALLEL_MAKE=0
|
||||
endif
|
||||
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
|
||||
|
||||
ifdef MAKE_NB_JOBS
|
||||
GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
|
||||
endif
|
||||
|
||||
ifeq ($(HOSTCC), loongcc)
|
||||
GETARCH_FLAGS += -static
|
||||
endif
|
||||
@@ -292,12 +296,14 @@ endif
|
||||
ifneq ($(OSNAME), WINNT)
|
||||
ifneq ($(OSNAME), CYGWIN_NT)
|
||||
ifneq ($(OSNAME), Interix)
|
||||
ifneq ($(OSNAME), Android)
|
||||
ifdef SMP
|
||||
EXTRALIB += -lpthread
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
|
||||
@@ -324,7 +330,8 @@ ifdef SANITY_CHECK
|
||||
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
||||
endif
|
||||
|
||||
ifdef MAX_STACK_ALLOC
|
||||
MAX_STACK_ALLOC ?= 2048
|
||||
ifneq ($(MAX_STACK_ALLOC), 0)
|
||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||
endif
|
||||
|
||||
@@ -374,7 +381,7 @@ FCOMMON_OPT += -m128bit-long-double
|
||||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
EXPRECISION = 1
|
||||
CCOMMON_OPT += -DEXPRECISION
|
||||
CCOMMON_OPT += -DEXPRECISION
|
||||
FCOMMON_OPT += -m128bit-long-double
|
||||
endif
|
||||
endif
|
||||
@@ -388,7 +395,7 @@ endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
|
||||
#check
|
||||
#check
|
||||
ifeq ($(USE_THREAD), 0)
|
||||
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
|
||||
endif
|
||||
@@ -952,17 +959,18 @@ ifeq ($(OSNAME), SunOS)
|
||||
TAR = gtar
|
||||
PATCH = gpatch
|
||||
GREP = ggrep
|
||||
AWK = nawk
|
||||
else
|
||||
TAR = tar
|
||||
PATCH = patch
|
||||
GREP = grep
|
||||
AWK = awk
|
||||
endif
|
||||
|
||||
ifndef MD5SUM
|
||||
MD5SUM = md5sum
|
||||
endif
|
||||
|
||||
AWK = awk
|
||||
|
||||
REVISION = -r$(VERSION)
|
||||
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
|
||||
@@ -980,12 +988,8 @@ COMMON_OPT = -O2
|
||||
endif
|
||||
|
||||
ifndef FCOMMON_OPT
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
FCOMMON_OPT = -O0
|
||||
else
|
||||
FCOMMON_OPT = -O2 -frecursive
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
@@ -1183,4 +1187,3 @@ SUNPATH = /opt/sunstudio12.1
|
||||
else
|
||||
SUNPATH = /opt/SUNWspro
|
||||
endif
|
||||
|
||||
|
||||
6
Makefile.zarch
Normal file
6
Makefile.zarch
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
CCOMMON_OPT += -march=z13 -mzvector
|
||||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
@@ -75,10 +75,15 @@ Please read GotoBLAS_01Readme.txt
|
||||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **Z13**: blas3 for double
|
||||
|
||||
|
||||
### Support OS:
|
||||
- **GNU/Linux**
|
||||
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
|
||||
|
||||
199
USAGE.md
Normal file
199
USAGE.md
Normal file
@@ -0,0 +1,199 @@
|
||||
# Notes on OpenBLAS usage
|
||||
## Usage
|
||||
|
||||
#### Program is Terminated. Because you tried to allocate too many memory regions
|
||||
|
||||
In OpenBLAS, we manage a pool of memory buffers and allocate the number of
|
||||
buffers as the following.
|
||||
```
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
```
|
||||
This error indicates that the program exceeded the number of buffers.
|
||||
|
||||
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
|
||||
following ways:
|
||||
|
||||
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
|
||||
* Call `openblas_set_num_threads(1)` in the application on runtime.
|
||||
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
|
||||
|
||||
If the application is parallelized by OpenMP, please use OpenBLAS built with
|
||||
`USE_OPENMP=1`
|
||||
|
||||
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
|
||||
|
||||
The environment variable that controls the kernel selection is
|
||||
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
|
||||
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
|
||||
returns the used target.
|
||||
|
||||
#### How could I disable OpenBLAS threading affinity on runtime?
|
||||
|
||||
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
|
||||
variable to disable threading affinity on runtime. For example, before the
|
||||
running,
|
||||
```
|
||||
export OPENBLAS_MAIN_FREE=1
|
||||
```
|
||||
|
||||
Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1`
|
||||
in `Makefile.rule`.
|
||||
|
||||
## Linking with the library
|
||||
|
||||
* Link with shared library
|
||||
|
||||
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
|
||||
|
||||
If the library is multithreaded, please add `-lpthread`. If the library
|
||||
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
|
||||
|
||||
* Link with static library
|
||||
|
||||
`gcc -o test test.c /your/path/libopenblas.a`
|
||||
|
||||
You can download `test.c` from https://gist.github.com/xianyi/5780018
|
||||
|
||||
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
|
||||
default), custom programs statically linked against `libopenblas.a` should also
|
||||
link with the pthread library e.g.:
|
||||
|
||||
```
|
||||
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
|
||||
```
|
||||
|
||||
Failing to add the `-lpthread` flag will cause errors such as:
|
||||
|
||||
```
|
||||
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
|
||||
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
|
||||
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
|
||||
...
|
||||
```
|
||||
|
||||
## Code examples
|
||||
|
||||
#### Call CBLAS interface
|
||||
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
|
||||
```
|
||||
#include <cblas.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void main()
|
||||
{
|
||||
int i=0;
|
||||
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
|
||||
|
||||
for(i=0; i<9; i++)
|
||||
printf("%lf ", C[i]);
|
||||
printf("\n");
|
||||
}
|
||||
```
|
||||
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
|
||||
|
||||
#### Call BLAS Fortran interface
|
||||
|
||||
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
|
||||
|
||||
```
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "sys/time.h"
|
||||
#include "time.h"
|
||||
|
||||
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int i;
|
||||
printf("test!\n");
|
||||
if(argc<4){
|
||||
printf("Input Error\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
int sizeofa = m * k;
|
||||
int sizeofb = k * n;
|
||||
int sizeofc = m * n;
|
||||
char ta = 'N';
|
||||
char tb = 'N';
|
||||
double alpha = 1.2;
|
||||
double beta = 0.001;
|
||||
|
||||
struct timeval start,finish;
|
||||
double duration;
|
||||
|
||||
double* A = (double*)malloc(sizeof(double) * sizeofa);
|
||||
double* B = (double*)malloc(sizeof(double) * sizeofb);
|
||||
double* C = (double*)malloc(sizeof(double) * sizeofc);
|
||||
|
||||
srand((unsigned)time(NULL));
|
||||
|
||||
for (i=0; i<sizeofa; i++)
|
||||
A[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofb; i++)
|
||||
B[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofc; i++)
|
||||
C[i] = i%3+1;//(rand()%100)/10.0;
|
||||
//#if 0
|
||||
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
|
||||
gettimeofday(&start, NULL);
|
||||
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
|
||||
gettimeofday(&finish, NULL);
|
||||
|
||||
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
|
||||
double gflops = 2.0 * m *n*k;
|
||||
gflops = gflops/duration*1.0e-6;
|
||||
|
||||
FILE *fp;
|
||||
fp = fopen("timeDGEMM.txt", "a");
|
||||
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
|
||||
fclose(fp);
|
||||
|
||||
free(A);
|
||||
free(B);
|
||||
free(C);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
|
||||
|
||||
` ./time_dgemm <m> <n> <k> `
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A, `make test` may fail because of a pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same test case from the shell.
|
||||
|
||||
## BLAS reference manual
|
||||
If you want to understand every BLAS function and definition, please read
|
||||
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
|
||||
or [netlib.org](http://netlib.org/blas/)
|
||||
|
||||
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
|
||||
|
||||
## How to reference OpenBLAS.
|
||||
|
||||
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
|
||||
|
||||
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
version: 0.2.15.{build}
|
||||
version: 0.2.19.{build}
|
||||
|
||||
#environment:
|
||||
|
||||
@@ -39,4 +39,6 @@ before_build:
|
||||
- cmake -G "Visual Studio 12 Win64" .
|
||||
|
||||
test_script:
|
||||
- echo Build OK!
|
||||
- echo Running Test
|
||||
- cd c:\projects\OpenBLAS\utest
|
||||
- openblas_utest
|
||||
|
||||
@@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
|
||||
# Apple vecLib
|
||||
LIBVECLIB = -framework Accelerate
|
||||
|
||||
ESSL=/opt/ibm/lib
|
||||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
@@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
sdot.goto ddot.goto \
|
||||
srot.goto drot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
|
||||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
@@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
sdot.goto ddot.goto cdot.goto zdot.goto \
|
||||
srot.goto drot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
|
||||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
@@ -166,7 +172,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
@@ -252,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||
|
||||
endif
|
||||
|
||||
|
||||
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
|
||||
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
|
||||
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
|
||||
|
||||
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
|
||||
@@ -305,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
|
||||
slinpack.veclib : slinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
slinpack.essl : slinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dlinpack ####################################################
|
||||
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -321,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
|
||||
dlinpack.veclib : dlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dlinpack.essl : dlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Clinpack ####################################################
|
||||
|
||||
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -338,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
|
||||
clinpack.veclib : clinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
clinpack.essl : clinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zlinpack ####################################################
|
||||
|
||||
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -355,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
|
||||
zlinpack.veclib : zlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zlinpack.essl : zlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Scholesky ###################################################
|
||||
|
||||
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -440,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX)
|
||||
sgemm.veclib : sgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
sgemm.essl : sgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dgemm ####################################################
|
||||
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -456,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX)
|
||||
dgemm.veclib : dgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dgemm.essl : dgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Cgemm ####################################################
|
||||
|
||||
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -473,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX)
|
||||
cgemm.veclib : cgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
cgemm.essl : cgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zgemm ####################################################
|
||||
|
||||
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -490,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX)
|
||||
zgemm.veclib : zgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zgemm.essl : zgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ssymm ####################################################
|
||||
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -572,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX)
|
||||
strmm.veclib : strmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
strmm.essl : strmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dtrmm ####################################################
|
||||
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -588,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX)
|
||||
dtrmm.veclib : dtrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dtrmm.essl : dtrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ctrmm ####################################################
|
||||
|
||||
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -605,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX)
|
||||
ctrmm.veclib : ctrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ctrmm.essl : ctrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ztrmm ####################################################
|
||||
|
||||
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -622,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX)
|
||||
ztrmm.veclib : ztrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ztrmm.essl : ztrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Strsm ####################################################
|
||||
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -1412,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX)
|
||||
zdot.veclib : zdot-intel.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Srot ####################################################
|
||||
srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
srot.acml : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.atlas : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.mkl : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.veclib : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Drot ####################################################
|
||||
drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
drot.acml : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.atlas : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.mkl : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.veclib : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
##################################### Saxpy ####################################################
|
||||
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -2123,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c
|
||||
zgesv.$(SUFFIX) : gesv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
srot.$(SUFFIX) : rot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
drot.$(SUFFIX) : rot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -2132,9 +2217,11 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
||||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
|
||||
|
||||
197
benchmark/rot.c
Normal file
197
benchmark/rot.c
Normal file
@@ -0,0 +1,197 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef DOT
|
||||
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define ROT BLASFUNC(drot)
|
||||
#else
|
||||
#define ROT BLASFUNC(srot)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * malloc() stand-in backed by a System V shared-memory segment requested
 * with SHM_HUGETLB.
 *
 *   size  number of bytes wanted; the request handed to shmget() is
 *         (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple
 *
 * Returns the address the segment was attached at.  Any failure of shmget()
 * or shmat() prints a message to stdout and terminates the process with
 * exit status 1, so the function never returns NULL.
 */
static void *huge_malloc(BLASLONG size){
  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
		   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
		   SHM_HUGETLB | IPC_CREAT |0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* IPC_RMID is issued right after attaching; presumably so the segment is
     reclaimed once the process detaches -- confirm against shmctl docs. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
// FLOAT result;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
FLOAT c[1] = { 2.0 };
|
||||
FLOAT s[1] = { 2.0 };
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
ROT (&m, x, &inc_x, y, &inc_y, c, s);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
58
benchmark/scripts/SCIPY/dsyrk.py
Executable file
58
benchmark/scripts/SCIPY/dsyrk.py
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import numpy
|
||||
from numpy import zeros
|
||||
from numpy.random import randn
|
||||
from scipy.linalg import blas
|
||||
|
||||
|
||||
def run_dsyrk(N, l):
    """Time `l` calls of scipy's BLAS dsyrk on an N x N matrix and print the rate.

    A random float64 matrix and a zeroed float64 result matrix, both in
    Fortran order, are created up front; only the loop of dsyrk calls is
    timed.  One line is printed with the size, a rate computed from
    N*N*N operations per call, and the elapsed wall-clock seconds.
    """
    a_mat = randn(N, N).astype('float64', order='F')
    c_mat = zeros((N, N), dtype='float64', order='F')

    t_begin = time.time()
    for _ in range(l):
        blas.dsyrk(1.0, a_mat, c=c_mat, overwrite_c=True)
    timediff = time.time() - t_begin

    mflops = (N * N * N) * l / timediff * 1e-6

    print("%14s :\t%20f MFlops\t%20f sec" % ("%dx%d" % (N, N), mflops, timediff))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Defaults; each may be overridden by an optional positional argument:
    #   dsyrk.py [N [NMAX [NINC [LOOPS]]]]
    N = 128        # first matrix order
    NMAX = 2048    # largest matrix order to run
    NINC = 128     # increment between orders
    LOOPS = 1      # dsyrk calls timed per order

    # sys.argv[0] is the script name; arguments beyond the fourth are ignored.
    for z, arg in enumerate(sys.argv):
        if z == 1:
            N = int(arg)
        elif z == 2:
            NMAX = int(arg)
        elif z == 3:
            NINC = int(arg)
        elif z == 4:
            LOOPS = int(arg)

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    if 'OPENBLAS_LOOPS' in os.environ:
        p = os.environ['OPENBLAS_LOOPS']
        if p:
            LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    # NMAX is inclusive.  The stop value is NMAX + 1 (not NMAX + NINC) so that
    # a range that is not a whole number of steps never runs a size above NMAX.
    for i in range(N, NMAX + 1, NINC):
        run_dsyrk(i, LOOPS)
|
||||
58
benchmark/scripts/SCIPY/ssyrk.py
Executable file
58
benchmark/scripts/SCIPY/ssyrk.py
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import numpy
|
||||
from numpy import zeros
|
||||
from numpy.random import randn
|
||||
from scipy.linalg import blas
|
||||
|
||||
|
||||
def run_ssyrk(N, l):
    """Time `l` calls of scipy's BLAS ssyrk on an N x N matrix and print the rate.

    A random float32 matrix and a zeroed float32 result matrix, both in
    Fortran order, are created up front; only the loop of ssyrk calls is
    timed.  One line is printed with the size, a rate computed from
    N*N*N operations per call, and the elapsed wall-clock seconds.
    """
    a_mat = randn(N, N).astype('float32', order='F')
    c_mat = zeros((N, N), dtype='float32', order='F')

    t_begin = time.time()
    for _ in range(l):
        blas.ssyrk(1.0, a_mat, c=c_mat, overwrite_c=True)
    timediff = time.time() - t_begin

    mflops = (N * N * N) * l / timediff * 1e-6

    print("%14s :\t%20f MFlops\t%20f sec" % ("%dx%d" % (N, N), mflops, timediff))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Defaults; each may be overridden by an optional positional argument:
    #   ssyrk.py [N [NMAX [NINC [LOOPS]]]]
    N = 128        # first matrix order
    NMAX = 2048    # largest matrix order to run
    NINC = 128     # increment between orders
    LOOPS = 1      # ssyrk calls timed per order

    # sys.argv[0] is the script name; arguments beyond the fourth are ignored.
    for z, arg in enumerate(sys.argv):
        if z == 1:
            N = int(arg)
        elif z == 2:
            NMAX = int(arg)
        elif z == 3:
            NINC = int(arg)
        elif z == 4:
            LOOPS = int(arg)

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    if 'OPENBLAS_LOOPS' in os.environ:
        p = os.environ['OPENBLAS_LOOPS']
        if p:
            LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    # NMAX is inclusive.  The stop value is NMAX + 1 (not NMAX + NINC) so that
    # a range that is not a whole number of steps never runs a size above NMAX.
    for i in range(N, NMAX + 1, NINC):
        run_ssyrk(i, LOOPS)
|
||||
196
benchmark/smallscaling.c
Normal file
196
benchmark/smallscaling.c
Normal file
@@ -0,0 +1,196 @@
|
||||
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
|
||||
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <omp.h>
#ifndef _WIN32
#include <pthread.h>
#endif
|
||||
#define MIN_SIZE 5
|
||||
#define MAX_SIZE 60
|
||||
#define NB_SIZE 10
|
||||
|
||||
// Number of loops for a 1x1 matrix. Lower it if the test is
|
||||
// too slow on your computer.
|
||||
#define NLOOP 2e7
|
||||
|
||||
/*
 * Description of one benchmark run, handed to the *_bench functions.
 *
 *   matrix_size    dimension the bench functions use for the square matrix
 *                  (size * size elements) and the vector (size elements)
 *   n_loop         iteration budget; the bench functions divide it by
 *                  matrix_size to get the number of BLAS calls
 *   bench_func     driver to execute; it is called with a pointer to this
 *                  structure
 *   blas_func      BLAS routine invoked by the driver; declared without a
 *                  prototype because the argument list differs per routine
 *   create_matrix  allocator returning an initialised buffer of `size`
 *                  elements of the type the BLAS routine works on
 */
typedef struct {
|
||||
int matrix_size;
|
||||
int n_loop;
|
||||
void (* bench_func)();
|
||||
|
||||
void (* blas_func)();
|
||||
|
||||
void * (* create_matrix)(int size);
|
||||
} BenchParam;
|
||||
|
||||
/*
 * Allocate `size` single-precision elements and fill element i with
 * 1e3 * i / size.
 *
 * Returns the buffer (release it with free()), or NULL when the allocation
 * fails.  The buffer is sized with the element type actually stored; the
 * previous sizeof(double) reserved twice the memory needed.
 */
void * s_create_matrix(int size) {
    float * r = malloc(size * sizeof(*r));
    int i;

    if (r == NULL)
        return NULL;   /* never write through a failed allocation */

    for(i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||
|
||||
/*
 * Allocate `size` single-precision complex elements, stored as 2 * size
 * consecutive floats, and fill float i with 1e3 * i / size.
 *
 * Returns the buffer (release it with free()), or NULL when the allocation
 * fails.  The buffer is sized with the element type actually stored; the
 * previous sizeof(double) reserved twice the memory needed.
 */
void * c_create_matrix(int size) {
    float * r = malloc(size * 2 * sizeof(*r));
    int i;

    if (r == NULL)
        return NULL;   /* never write through a failed allocation */

    for(i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||
|
||||
/*
 * Allocate `size` double-precision complex elements, stored as 2 * size
 * consecutive doubles, and fill double i with 1e3 * i / size.
 *
 * Returns the buffer (release it with free()), or NULL when the allocation
 * fails.
 */
void * z_create_matrix(int size) {
    double * r = malloc(size * 2 * sizeof(*r));
    int i;

    if (r == NULL)
        return NULL;   /* never write through a failed allocation */

    for(i = 0; i < 2 * size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||
|
||||
/*
 * Allocate `size` double-precision elements and fill element i with
 * 1e3 * i / size.
 *
 * Returns the buffer (release it with free()), or NULL when the allocation
 * fails.
 */
void * d_create_matrix(int size) {
    double * r = malloc(size * sizeof(*r));
    int i;

    if (r == NULL)
        return NULL;   /* never write through a failed allocation */

    for(i = 0; i < size; i++)
        r[i] = 1e3 * i / size;
    return r;
}
|
||||
|
||||
/*
 * Benchmark body for a triangular matrix-vector routine.
 *
 * Builds a matrix_size x matrix_size matrix and a matrix_size vector with
 * param->create_matrix, then calls param->blas_func with the arguments
 * "U", "N", "N", unit stride, n_loop / matrix_size times.  Both buffers are
 * freed before returning.
 */
void trmv_bench(BenchParam * param)
{
    int dim = param->matrix_size;
    int unit_stride = 1;
    int remaining = param->n_loop / dim;
    void * matrix = param->create_matrix(dim * dim);
    void * vec = param->create_matrix(dim);

    while (remaining-- > 0) {
        param->blas_func("U", "N", "N", &dim, matrix, &dim, vec, &unit_stride);
    }

    free(matrix);
    free(vec);
}
|
||||
|
||||
/*
 * Benchmark body for a general matrix-vector routine.
 *
 * Builds a matrix_size x matrix_size matrix and a matrix_size vector with
 * param->create_matrix, then calls param->blas_func (no transpose, unit
 * strides, the same vector as input and output) n_loop / matrix_size times.
 * Both buffers are freed before returning.
 */
void gemv_bench(BenchParam * param)
{
    int i, n;
    int size = param->matrix_size;
    /* alpha and beta.  Kept as a (real, imaginary) pair because this driver
       is also used with the complex routine zgemv_, which reads two doubles
       through each scalar pointer; a lone double made that read run past the
       variable.  Real routines only look at v[0]. */
    double v[2] = { 1.01, 0.0 };
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);

    n = param->n_loop / size;
    for(i = 0; i < n; i++) {
        param->blas_func("N", &size, &size, v, A, &size, y, &one, v, y, &one);
    }
    free(A);
    free(y);
}
|
||||
|
||||
/*
 * Benchmark body for a rank-1 update routine.
 *
 * Builds a matrix_size x matrix_size matrix and a matrix_size vector with
 * param->create_matrix, then calls param->blas_func (the same vector for
 * both operands, unit strides) n_loop / matrix_size times.  Both buffers are
 * freed before returning.
 */
void ger_bench(BenchParam * param) {
    int i, n;
    int size = param->matrix_size;
    /* alpha.  Kept as a (real, imaginary) pair because this driver is also
       used with the complex routine zgerc_, which reads two doubles through
       the scalar pointer; a lone double made that read run past the
       variable.  Real routines only look at v[0]. */
    double v[2] = { 1.01, 0.0 };
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);

    n = param->n_loop / size;
    for(i = 0; i < n; i++) {
        param->blas_func(&size, &size, v, y, &one, y, &one, A, &size);
    }
    free(A);
    free(y);
}
|
||||
|
||||
#ifndef _WIN32
|
||||
void * pthread_func_wrapper(void * param) {
|
||||
((BenchParam *)param)->bench_func(param);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
#define NB_TESTS 5
|
||||
/*
 * Benchmark table, four entries per test, read by main() in this order:
 *   [4*i + 0]  bench driver (stored into BenchParam.bench_func)
 *   [4*i + 1]  BLAS routine (stored into BenchParam.blas_func)
 *   [4*i + 2]  matrix allocator (stored into BenchParam.create_matrix)
 *   [4*i + 3]  name printed in the "Benchmark of ..." line
 * The number of rows must match NB_TESTS.
 */
void * TESTS[4 * NB_TESTS] = {
|
||||
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
|
||||
gemv_bench, dgemv_, d_create_matrix, "dgemv",
|
||||
gemv_bench, zgemv_, z_create_matrix, "zgemv",
|
||||
ger_bench, dger_, d_create_matrix, "dger",
|
||||
ger_bench, zgerc_, z_create_matrix, "zgerc",
|
||||
};
|
||||
|
||||
inline static double delta_time(struct timespec tick) {
|
||||
struct timespec tock;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tock);
|
||||
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
|
||||
}
|
||||
|
||||
/*
 * Run the benchmark on `nb_threads` POSIX threads and return the elapsed
 * wall-clock seconds (thread creation and joining included).
 *
 * Every thread receives the same copy of *param whose n_loop has been
 * divided by nb_threads.  If pthread_create() fails, a message is printed
 * and the process exits with status -1.  When _WIN32 is defined nothing is
 * run and 0 is returned.
 */
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifndef _WIN32
    BenchParam shared = *param;
    pthread_t workers[nb_threads];
    struct timespec started;
    int idx;

    shared.n_loop /= nb_threads;
    clock_gettime(CLOCK_MONOTONIC, &started);

    for (idx = 0; idx < nb_threads; idx++) {
        int status = pthread_create(&workers[idx], NULL, pthread_func_wrapper, &shared);
        if (status != 0) {
            printf("ERROR; return code from pthread_create() is %d\n", status);
            exit(-1);
        }
    }

    idx = 0;
    while (idx < nb_threads) {
        pthread_join(workers[idx], NULL);
        idx++;
    }

    return delta_time(started);
#else
    return 0;
#endif
}
|
||||
|
||||
/*
 * Run the benchmark once on the calling thread and return the elapsed
 * wall-clock seconds.
 */
double seq_bench(BenchParam * param) {
    struct timespec started;

    clock_gettime(CLOCK_MONOTONIC, &started);
    (*param->bench_func)(param);

    return delta_time(started);
}
|
||||
|
||||
/*
 * Run the benchmark from an OpenMP parallel loop and return the elapsed
 * wall-clock seconds.
 *
 * The loop has omp_get_max_threads() iterations; each one executes the
 * bench function on the same copy of *param whose n_loop has been divided
 * by that thread count.
 */
double omp_bench(BenchParam * param) {
    BenchParam per_thread = *param;
    struct timespec started;
    int worker;
    int team = omp_get_max_threads();

    per_thread.n_loop /= team;
    clock_gettime(CLOCK_MONOTONIC, &started);

    #pragma omp parallel for
    for (worker = 0; worker < team; worker++) {
        (*param->bench_func)(&per_thread);
    }

    return delta_time(started);
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
|
||||
BenchParam param;
|
||||
int test_id;
|
||||
printf ("Running on %d threads\n", omp_get_max_threads());
|
||||
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
|
||||
double size = MIN_SIZE;
|
||||
param.bench_func = TESTS[test_id * 4];
|
||||
param.blas_func = TESTS[test_id * 4 + 1];
|
||||
param.create_matrix = TESTS[test_id * 4 + 2];
|
||||
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
|
||||
param.n_loop = NLOOP;
|
||||
while(size <= MAX_SIZE) {
|
||||
param.matrix_size = (int)(size + 0.5);
|
||||
double seq_time = seq_bench(¶m);
|
||||
double omp_time = omp_bench(¶m);
|
||||
double pthread_time = pthread_bench(¶m, omp_get_max_threads());
|
||||
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
|
||||
"pthread %gs, speedup %g\n",
|
||||
param.matrix_size, seq_time,
|
||||
omp_time, seq_time / omp_time,
|
||||
pthread_time, seq_time / pthread_time);
|
||||
size *= inc_factor;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
8
c_check
8
c_check
@@ -7,6 +7,7 @@ $hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
@@ -70,6 +71,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
@@ -89,6 +91,11 @@ if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "zarch") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
@@ -162,6 +169,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
@@ -14,12 +14,12 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
|
||||
if (NOT NO_EXPRECISION)
|
||||
if (${F_COMPILER} MATCHES "GFORTRAN")
|
||||
# N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
|
||||
set(EXPRECISION 1)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
|
||||
endif ()
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
|
||||
set(EXPRECISION 1)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
|
||||
@@ -28,35 +28,35 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
|
||||
endif ()
|
||||
|
||||
if (USE_OPENMP)
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
|
||||
message(WARNING "Clang doesn't support OpenMP yet.")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -openmp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
|
||||
set(CEXTRALIB "${CEXTRALIB} -lstdc++")
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
|
||||
endif ()
|
||||
endif ()
|
||||
@@ -87,7 +87,7 @@ if (${ARCH} STREQUAL "ia64")
|
||||
set(BINARY_DEFINED 1)
|
||||
|
||||
if (${F_COMPILER} MATCHES "GFORTRAN")
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
# EXPRECISION = 1
|
||||
# CCOMMON_OPT += -DEXPRECISION
|
||||
endif ()
|
||||
|
||||
@@ -48,18 +48,18 @@ set(SLASRC
|
||||
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
|
||||
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
|
||||
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
|
||||
sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
|
||||
sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
|
||||
sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
|
||||
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
|
||||
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
|
||||
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
|
||||
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
|
||||
sgetc2.f sgetri.f
|
||||
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
|
||||
sggglm.f sgghrd.f sgglse.f sggqrf.f
|
||||
sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
|
||||
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
|
||||
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
|
||||
slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
|
||||
DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
|
||||
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
|
||||
slansy.f slantb.f slantp.f slantr.f slanv2.f
|
||||
slapll.f slapmt.f
|
||||
@@ -69,7 +69,7 @@ set(SLASRC
|
||||
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f
|
||||
slarrv.f slartv.f
|
||||
slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f
|
||||
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f
|
||||
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
|
||||
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
|
||||
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
|
||||
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
|
||||
@@ -97,7 +97,7 @@ set(SLASRC
|
||||
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
|
||||
stptrs.f
|
||||
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
|
||||
strtrs.f stzrqf.f stzrzf.f sstemr.f
|
||||
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
|
||||
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
|
||||
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
|
||||
sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f
|
||||
@@ -114,14 +114,14 @@ set(CLASRC
|
||||
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
|
||||
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
|
||||
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
|
||||
cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
|
||||
cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
|
||||
cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
|
||||
DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
|
||||
cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
|
||||
DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
|
||||
cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f
|
||||
cgesvx.f cgetc2.f cgetri.f
|
||||
cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f
|
||||
cgghrd.f cgglse.f cggqrf.f cggrqf.f
|
||||
cggsvd.f cggsvp.f
|
||||
DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f
|
||||
cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f
|
||||
chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f
|
||||
checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f
|
||||
@@ -138,7 +138,7 @@ set(CLASRC
|
||||
claed0.f claed7.f claed8.f
|
||||
claein.f claesy.f claev2.f clags2.f clagtm.f
|
||||
clahef.f clahef_rook.f clahqr.f
|
||||
clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
|
||||
DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
|
||||
clanhb.f clanhe.f
|
||||
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
|
||||
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
|
||||
@@ -149,7 +149,7 @@ set(CLASRC
|
||||
clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
|
||||
clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
|
||||
clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
|
||||
DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
|
||||
cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f
|
||||
cposv.f cposvx.f cpstrf.f cpstf2.f
|
||||
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
|
||||
@@ -166,7 +166,7 @@ set(CLASRC
|
||||
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
|
||||
ctprfs.f ctptri.f
|
||||
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
|
||||
ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f
|
||||
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
|
||||
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
|
||||
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
|
||||
cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
|
||||
@@ -186,18 +186,18 @@ set(DLASRC
|
||||
dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
|
||||
dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f
|
||||
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
|
||||
dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
|
||||
dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
|
||||
dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
|
||||
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
|
||||
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
|
||||
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
|
||||
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
|
||||
dgetc2.f dgetri.f
|
||||
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
|
||||
dggglm.f dgghrd.f dgglse.f dggqrf.f
|
||||
dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
|
||||
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
|
||||
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
|
||||
dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
|
||||
DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
|
||||
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
|
||||
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
|
||||
dlapll.f dlapmt.f
|
||||
@@ -207,7 +207,7 @@ set(DLASRC
|
||||
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f
|
||||
dlargv.f dlarrv.f dlartv.f
|
||||
dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f
|
||||
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f
|
||||
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
|
||||
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
|
||||
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
|
||||
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
|
||||
@@ -235,7 +235,7 @@ set(DLASRC
|
||||
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
|
||||
dtptrs.f
|
||||
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
|
||||
dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f
|
||||
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
|
||||
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
|
||||
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
|
||||
dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f
|
||||
@@ -251,14 +251,14 @@ set(ZLASRC
|
||||
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
|
||||
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
|
||||
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
|
||||
zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
|
||||
zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
|
||||
zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
|
||||
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
|
||||
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
|
||||
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
|
||||
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
|
||||
zgetri.f
|
||||
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
|
||||
zgghrd.f zgglse.f zggqrf.f zggrqf.f
|
||||
zggsvd.f zggsvp.f
|
||||
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
|
||||
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
|
||||
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
|
||||
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
|
||||
@@ -275,7 +275,7 @@ set(ZLASRC
|
||||
zlaed0.f zlaed7.f zlaed8.f
|
||||
zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f
|
||||
zlahef.f zlahef_rook.f zlahqr.f
|
||||
zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
|
||||
DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
|
||||
zlangt.f zlanhb.f
|
||||
zlanhe.f
|
||||
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
|
||||
@@ -288,7 +288,7 @@ set(ZLASRC
|
||||
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
|
||||
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
|
||||
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
|
||||
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
|
||||
@@ -306,7 +306,7 @@ set(ZLASRC
|
||||
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
|
||||
ztprfs.f ztptri.f
|
||||
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
|
||||
ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f
|
||||
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
|
||||
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
|
||||
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
|
||||
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
|
||||
|
||||
@@ -2038,6 +2038,59 @@ set(MATGEN
|
||||
lapacke_zlagsy_work.c
|
||||
)
|
||||
|
||||
set(Utils_SRC
|
||||
lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c
|
||||
lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c
|
||||
lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c
|
||||
lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c
|
||||
lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c
|
||||
lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c
|
||||
lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c
|
||||
lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c
|
||||
lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c
|
||||
lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c
|
||||
lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c
|
||||
lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c
|
||||
lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c
|
||||
lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c
|
||||
lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c
|
||||
lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c
|
||||
lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c
|
||||
lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c
|
||||
lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c
|
||||
lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c
|
||||
lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c
|
||||
lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c
|
||||
lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c
|
||||
lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c
|
||||
lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c
|
||||
lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c
|
||||
lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c
|
||||
lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c
|
||||
lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c
|
||||
lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c
|
||||
lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c
|
||||
lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c
|
||||
lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c
|
||||
lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c
|
||||
lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c
|
||||
lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c
|
||||
lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c
|
||||
lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c
|
||||
lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c
|
||||
lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c
|
||||
lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c
|
||||
lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c
|
||||
lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c
|
||||
lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c
|
||||
lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c
|
||||
lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c
|
||||
lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c
|
||||
lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c
|
||||
lapacke_dpb_nancheck.c lapacke_sst_nancheck.c
|
||||
lapacke_dpb_trans.c lapacke_ssy_nancheck.c
|
||||
)
|
||||
|
||||
set(LAPACKE_REL_SRC "")
|
||||
if (BUILD_SINGLE)
|
||||
list(APPEND LAPACKE_REL_SRC ${SSRC})
|
||||
@@ -2058,10 +2111,14 @@ endif ()
|
||||
# add lapack-netlib folder to the sources
|
||||
set(LAPACKE_SOURCES "")
|
||||
foreach (LAE_FILE ${LAPACKE_REL_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}")
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}")
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include")
|
||||
foreach (Utils_FILE ${Utils_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}")
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
||||
12
common.h
12
common.h
@@ -93,7 +93,7 @@ extern "C" {
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
@@ -332,12 +332,13 @@ typedef int blasint;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
@@ -408,10 +409,14 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_arm64.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_ZARCH
|
||||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWS
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p))
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
#else
|
||||
typedef char* env_var_t;
|
||||
#define readenv(p, n) ((p)=getenv(n))
|
||||
@@ -727,6 +732,7 @@ typedef struct {
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#include "common_stackalloc.h"
|
||||
#if 0
|
||||
#include "symcopy.h"
|
||||
#endif
|
||||
|
||||
@@ -70,7 +70,7 @@ extern long int syscall (long int __sysno, ...);
|
||||
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
unsigned long *nodemask, unsigned long maxnode,
|
||||
unsigned flags) {
|
||||
#if defined (__LSB_VERSION__)
|
||||
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
|
||||
// So far, LSB (Linux Standard Base) don't support syscall().
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
@@ -90,7 +90,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
#if defined (__LSB_VERSION__)
|
||||
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
|
||||
// So far, LSB (Linux Standard Base) don't support syscall().
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
|
||||
@@ -1194,8 +1194,6 @@ extern gotoblas_t *gotoblas;
|
||||
#define XGEMM_DEFAULT_UNROLL_N 2
|
||||
#endif
|
||||
|
||||
#define GEMM_THREAD gemm_thread_m
|
||||
|
||||
#ifndef GEMM_THREAD
|
||||
#define GEMM_THREAD gemm_thread_n
|
||||
#endif
|
||||
|
||||
@@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
@@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#define L1_DUALFETCH
|
||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#
|
||||
#ifndef L1_PREFETCH
|
||||
#define L1_PREFETCH dcbt
|
||||
#endif
|
||||
@@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
|
||||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
||||
73
common_stackalloc.h
Normal file
73
common_stackalloc.h
Normal file
@@ -0,0 +1,73 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
/* Stack-canary support. STACK_ALLOC_PROTECT is defined unconditionally right
 * here, so the checking variants of the two hook macros are always selected;
 * the empty variants in the #else branch are only reached if the define below
 * is removed. */
#define STACK_ALLOC_PROTECT
|
||||
#ifdef STACK_ALLOC_PROTECT
|
||||
// Try to detect stack smashing
|
||||
#include <assert.h>
|
||||
/* Declares a local canary variable initialised to a fixed magic value. */
#define STACK_ALLOC_PROTECT_SET volatile int stack_check = 0x7fc01234;
|
||||
/* Asserts that the canary still holds the magic value. Because this uses
 * assert(), the check compiles to nothing when NDEBUG is defined. */
#define STACK_ALLOC_PROTECT_CHECK assert(stack_check == 0x7fc01234);
|
||||
#else
|
||||
/* Protection disabled: both hooks expand to nothing. */
#define STACK_ALLOC_PROTECT_SET
|
||||
#define STACK_ALLOC_PROTECT_CHECK
|
||||
#endif
|
||||
|
||||
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
|
||||
/*
|
||||
* Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC.
|
||||
* Stack allocation is much faster than blas_memory_alloc or malloc, particularly
|
||||
* when OpenBLAS is used from a multi-threaded application.
|
||||
* SIZE must be carefully chosen to be:
|
||||
* - as small as possible to maximize the number of stack allocations
|
||||
* - large enough to support all architectures and kernels
|
||||
* Choosing too small a SIZE will lead to stack smashing.
|
||||
*/
|
||||
/*
 * STACK_ALLOC(SIZE, TYPE, BUFFER)
 *   SIZE   - number of TYPE elements requested.
 *   TYPE   - element type of the buffer.
 *   BUFFER - lvalue of type TYPE* that receives the buffer address.
 *
 * Expands to several statements and declares the locals `stack_alloc_size`
 * and `stack_buffer` in the enclosing scope, so it can be used at most once
 * per scope. If SIZE exceeds MAX_STACK_ALLOC / sizeof(TYPE), the request is
 * served by blas_memory_alloc(1) instead and `stack_alloc_size` is set to 0.
 *
 * The array is always declared with at least one element: a variable length
 * array whose size evaluates to 0 is undefined behaviour in C, which the
 * fallback path would otherwise trigger.
 */
#define STACK_ALLOC(SIZE, TYPE, BUFFER)                                        \
  /* make it volatile because some functions (ex: dgemv_n.S) */                \
  /* do not restore all registers */                                           \
  volatile int stack_alloc_size = SIZE;                                        \
  if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE))                       \
    stack_alloc_size = 0;                                                      \
  STACK_ALLOC_PROTECT_SET                                                      \
  /* avoid declaring an array of length 0 on the fallback path */              \
  TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1]                   \
    __attribute__((aligned(0x20)));                                            \
  BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1)
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * STACK_FREE(BUFFER)
 * With MAX_STACK_ALLOC > 0 the macro first expands STACK_ALLOC_PROTECT_CHECK,
 * then calls blas_memory_free(BUFFER) only when `stack_alloc_size` is zero.
 * `stack_alloc_size` is not declared here; it must already be in scope at the
 * expansion site. The expansion is an unbraced `if` statement, so it should be
 * used as a standalone statement.
 * Otherwise BUFFER is always handed to blas_memory_free().
 */
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
#define STACK_FREE(BUFFER) \
|
||||
STACK_ALLOC_PROTECT_CHECK \
|
||||
if(!stack_alloc_size) \
|
||||
blas_memory_free(BUFFER);
|
||||
#else
|
||||
#define STACK_FREE(BUFFER) blas_memory_free(BUFFER)
|
||||
#endif
|
||||
|
||||
15
common_x86.h
15
common_x86.h
@@ -41,6 +41,10 @@
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
#ifdef C_MSVC
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
|
||||
@@ -58,7 +62,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
// use intrinsic instead of inline assembly
|
||||
ret = _InterlockedExchange(address, 1);
|
||||
ret = _InterlockedExchange((volatile LONG *)address, 1);
|
||||
// inline assembly
|
||||
/*__asm {
|
||||
mov eax, address
|
||||
@@ -170,12 +174,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
|
||||
if (y <= 1) return x;
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
result = x/y;
|
||||
return result;
|
||||
#else
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
(void*)result;
|
||||
return x*y;
|
||||
#else
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
|
||||
return result;
|
||||
|
||||
@@ -396,7 +396,7 @@ REALNAME:
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#define EPILOGUE .end REALNAME
|
||||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
|
||||
140
common_zarch.h
Normal file
140
common_zarch.h
Normal file
@@ -0,0 +1,140 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_ZARCH
|
||||
#define COMMON_ZARCH
|
||||
|
||||
#define MB
|
||||
//__asm__ __volatile__ ("dmb ish" : : : "memory")
|
||||
#define WMB
|
||||
//__asm__ __volatile__ ("dmb ishst" : : : "memory")
|
||||
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/*
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"mov x4, #1 \n\t"
|
||||
"1: \n\t"
|
||||
"ldaxr x2, [%1] \n\t"
|
||||
"cbnz x2, 1b \n\t"
|
||||
"2: \n\t"
|
||||
"stxr w3, x4, [%1] \n\t"
|
||||
"cbnz w3, 1b \n\t"
|
||||
"mov %0, #0 \n\t"
|
||||
: "=r"(ret), "=r"(address)
|
||||
: "1"(address)
|
||||
: "memory", "x2" , "x3", "x4"
|
||||
|
||||
|
||||
);
|
||||
|
||||
|
||||
} while (ret);
|
||||
|
||||
}
|
||||
*/
|
||||
//#define BLAS_LOCK_DEFINED
|
||||
|
||||
|
||||
|
||||
/* Returns x / y using plain C integer division (no lookup table or inline
 * asm in this port). y is not checked against zero here. */
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
/* GET_IMAGE(res): inline-asm store into the memory operand `res`. The DOUBLE
 * build names register d1, every other build names s1.
 * NOTE(review): `str d1` / `str s1` look like AArch64 mnemonics rather than
 * z/Architecture ones, presumably carried over from another port's header;
 * confirm whether any zarch code expands this macro and replace it if so. */
#if defined(DOUBLE)
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
/* Defined empty in this port. */
#define GET_IMAGE_CANCEL
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif
|
||||
|
||||
/* Definitions used only when this header is included from assembler sources
 * (ASSEMBLER defined) and NEEDPARAM is not defined. */
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
/* PROLOGUE: selects the .text section, applies `.align 256`, declares
 * REALNAME as a global symbol of type function, and emits the REALNAME label
 * so the kernel body can follow directly. */
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 256 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
REALNAME:
|
||||
|
||||
|
||||
/* EPILOGUE expands to nothing in this port. */
#define EPILOGUE
|
||||
|
||||
/* PROFCODE expands to nothing in this port. */
#define PROFCODE
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
/* Default page size of 4 KiB (4 << 10) unless PAGESIZE was already defined. */
#ifndef PAGESIZE
|
||||
#define PAGESIZE ( 4 << 10)
|
||||
#endif
|
||||
/* Huge page size: 4 MiB (4 << 20). */
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
/* Buffer size: 20 MiB when CORTEXA57 is defined, 16 MiB otherwise.
 * NOTE(review): CORTEXA57 is an ARM core name; presumably it is never defined
 * for zarch builds, so the 16 MiB branch would always be taken; confirm and
 * consider dropping the conditional. */
#if defined(CORTEXA57)
|
||||
#define BUFFER_SIZE (20 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
#endif
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
24
cpuid_arm.c
24
cpuid_arm.c
@@ -115,6 +115,9 @@ int detect(void)
|
||||
if (strstr(p, "0xc0f")) {
|
||||
return CPU_CORTEXA15;
|
||||
}
|
||||
if (strstr(p, "0xd07")) {
|
||||
return CPU_ARMV7; //ARMV8 on 32-bit
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -158,6 +161,27 @@ int detect(void)
|
||||
|
||||
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if ((!strncmp("CPU architecture", buffer, 16)))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL) {
|
||||
if (strstr(p, "8")) {
|
||||
return CPU_ARMV7; //ARMV8 on 32-bit
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return CPU_UNKNOWN;
|
||||
|
||||
@@ -191,6 +191,8 @@ void get_cpuconfig(void)
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
#define CPUTYPE_POWER6 5
|
||||
#define CPUTYPE_CELL 6
|
||||
#define CPUTYPE_PPCG4 7
|
||||
#define CPUTYPE_POWER8 8
|
||||
|
||||
char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
@@ -65,6 +66,7 @@ char *cpuname[] = {
|
||||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
};
|
||||
|
||||
char *lowercpuname[] = {
|
||||
@@ -76,6 +78,7 @@ char *lowercpuname[] = {
|
||||
"power6",
|
||||
"cell",
|
||||
"ppcg4",
|
||||
"power8"
|
||||
};
|
||||
|
||||
char *corename[] = {
|
||||
@@ -87,6 +90,7 @@ char *corename[] = {
|
||||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
@@ -115,7 +119,7 @@ int detect(void){
|
||||
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
|
||||
|
||||
@@ -1172,6 +1172,9 @@ int get_cpuname(void){
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 13:
|
||||
// Avoton
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
@@ -1229,6 +1232,7 @@ int get_cpuname(void){
|
||||
case 2:
|
||||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 3:
|
||||
case 10:
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 6:
|
||||
@@ -1674,6 +1678,9 @@ int get_coretype(void){
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 13:
|
||||
// Avoton
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
|
||||
93
cpuid_zarch.c
Normal file
93
cpuid_zarch.c
Normal file
@@ -0,0 +1,93 @@
|
||||
/**************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/* Identifiers of the CPU models this file knows about.  The values are
   also used as subscripts into the two name tables that follow.  */
#define CPU_GENERIC 0
#define CPU_Z13 1

/* Upper-case model names, indexed by CPU_* identifier.  */
static char *cpuname[] = {
  [CPU_GENERIC] = "ZARCH_GENERIC",
  [CPU_Z13]     = "Z13",
};

/* Lower-case model names, indexed by CPU_* identifier.  */
static char *cpuname_lower[] = {
  [CPU_GENERIC] = "zarch_generic",
  [CPU_Z13]     = "z13",
};
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
// return CPU_GENERIC;
|
||||
return CPU_Z13;
|
||||
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
||||
char *get_corename(void)
|
||||
{
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Print the architecture family name, "ZARCH", on stdout (no newline).  */
void get_architecture(void) {
  fputs("ZARCH", stdout);
}
|
||||
|
||||
void get_subarchitecture(void)
|
||||
{
|
||||
int d = detect();
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
/* Print the sub-directory name, "zarch", on stdout (no newline).  */
void get_subdirname(void) {
  fputs("zarch", stdout);
}
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
switch (d){
|
||||
case CPU_GENERIC:
|
||||
printf("#define ZARCH_GENERIC\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z13:
|
||||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
4
ctest.c
4
ctest.c
@@ -105,6 +105,10 @@ ARCH_X86_64
|
||||
ARCH_POWER
|
||||
#endif
|
||||
|
||||
#if defined(__s390x__) || defined(__zarch__)
|
||||
ARCH_ZARCH
|
||||
#endif
|
||||
|
||||
#ifdef __mips64
|
||||
ARCH_MIPS64
|
||||
#endif
|
||||
|
||||
@@ -1365,8 +1365,9 @@
|
||||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
||||
@@ -1365,8 +1365,9 @@
|
||||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
||||
@@ -1335,8 +1335,9 @@
|
||||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
||||
@@ -1339,8 +1339,9 @@
|
||||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
||||
@@ -1350,7 +1350,7 @@
|
||||
*
|
||||
* Call the subroutine.
|
||||
*
|
||||
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
|
||||
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
|
||||
IF( FULL )THEN
|
||||
IF( TRACE )
|
||||
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
|
||||
@@ -1376,7 +1376,7 @@
|
||||
CALL CZTPMV( IORDER, UPLO, TRANS, DIAG,
|
||||
$ N, AA, XX, INCX )
|
||||
END IF
|
||||
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
|
||||
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
|
||||
IF( FULL )THEN
|
||||
IF( TRACE )
|
||||
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
|
||||
@@ -1465,7 +1465,7 @@
|
||||
END IF
|
||||
*
|
||||
IF( .NOT.NULL )THEN
|
||||
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
|
||||
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
|
||||
*
|
||||
* Check the result.
|
||||
*
|
||||
@@ -1473,7 +1473,7 @@
|
||||
$ INCX, ZERO, Z, INCX, XT, G,
|
||||
$ XX, EPS, ERR, FATAL, NOUT,
|
||||
$ .TRUE. )
|
||||
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
|
||||
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
|
||||
*
|
||||
* Compute approximation to original vector.
|
||||
*
|
||||
@@ -1611,7 +1611,7 @@
|
||||
* .. Common blocks ..
|
||||
COMMON /INFOC/INFOT, NOUTC, OK
|
||||
* .. Executable Statements ..
|
||||
CONJ = SNAME( 5: 5 ).EQ.'c'
|
||||
CONJ = SNAME( 11: 11 ).EQ.'c'
|
||||
* Define the number of arguments.
|
||||
NARGS = 9
|
||||
*
|
||||
|
||||
@@ -1366,8 +1366,9 @@
|
||||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
||||
@@ -1366,8 +1366,9 @@
|
||||
*
|
||||
150 CONTINUE
|
||||
WRITE( NOUT, FMT = 9996 )SNAME
|
||||
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
IF( TRACE )
|
||||
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
|
||||
$ M, N, ALPHA, LDA, LDB)
|
||||
*
|
||||
160 CONTINUE
|
||||
RETURN
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||
#endif
|
||||
|
||||
x = buffer;
|
||||
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
|
||||
buffer += ((COMPSIZE * args -> m + 3) & ~3);
|
||||
}
|
||||
|
||||
#ifndef TRANS
|
||||
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||
|
||||
if (num_cpu) {
|
||||
queue[0].sa = NULL;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
|
||||
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -48,8 +48,7 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
# TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
|
||||
|
||||
# Need to set CONJ for trmm and trsm
|
||||
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type})
|
||||
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type})
|
||||
@@ -72,6 +71,10 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type})
|
||||
|
||||
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
#herk
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
|
||||
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
|
||||
|
||||
#hemm
|
||||
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type})
|
||||
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type})
|
||||
@@ -96,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
# for gemm3m
|
||||
if(USE_GEMM3M)
|
||||
foreach (GEMM_DEFINE ${GEMM_DEFINES})
|
||||
string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
endif ()
|
||||
endforeach ()
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
||||
@@ -335,7 +335,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -367,7 +367,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -33,6 +33,7 @@ set(COMMON_SOURCES
|
||||
xerbla.c
|
||||
openblas_set_num_threads.c
|
||||
openblas_error_handle.c
|
||||
openblas_env.c
|
||||
openblas_get_num_procs.c
|
||||
openblas_get_num_threads.c
|
||||
)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
TOPDIR = ../..
|
||||
include ../../Makefile.system
|
||||
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
|
||||
|
||||
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||
|
||||
@@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
|
||||
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_env.$(SUFFIX) : openblas_env.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
@@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
#endif
|
||||
|
||||
extern unsigned int openblas_thread_timeout();
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
|
||||
#undef MONITOR
|
||||
@@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
|
||||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
int thread_timeout_env;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
@@ -540,22 +543,12 @@ int blas_thread_init(void){
|
||||
|
||||
if (!blas_server_avail){
|
||||
|
||||
env_var_t p;
|
||||
|
||||
if (readenv(p,"THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}else{
|
||||
if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}
|
||||
}
|
||||
|
||||
thread_timeout_env=openblas_thread_timeout();
|
||||
if (thread_timeout_env>0) {
|
||||
if (thread_timeout_env < 4) thread_timeout_env = 4;
|
||||
if (thread_timeout_env > 30) thread_timeout_env = 30;
|
||||
thread_timeout = (1 << thread_timeout_env);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
|
||||
@@ -576,10 +569,12 @@ int blas_thread_init(void){
|
||||
struct rlimit rlim;
|
||||
const char *msg = strerror(ret);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
|
||||
#ifdef RLIMIT_NPROC
|
||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||
"%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
|
||||
}
|
||||
#endif
|
||||
if(0 != raise(SIGINT)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
@@ -261,6 +261,11 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Avoton
|
||||
if (model == 13) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 5:
|
||||
//Intel Broadwell
|
||||
@@ -386,7 +391,7 @@ static char *corename[] = {
|
||||
"Nehalem",
|
||||
"Athlon",
|
||||
"Opteron",
|
||||
"Opteron(SSE3)",
|
||||
"Opteron_SSE3",
|
||||
"Barcelona",
|
||||
"Nano",
|
||||
"Sandybridge",
|
||||
|
||||
@@ -144,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define CONSTRUCTOR __cdecl
|
||||
#define DESTRUCTOR __cdecl
|
||||
#elif defined(OS_DARWIN) && defined(C_GCC)
|
||||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
@@ -169,7 +169,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -294,8 +294,11 @@ void openblas_fork_handler()
|
||||
#endif
|
||||
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
env_var_t p;
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
@@ -310,18 +313,18 @@ int blas_get_cpu_number(void){
|
||||
|
||||
blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
@@ -357,7 +360,9 @@ int openblas_get_num_threads(void) {
|
||||
#ifndef SMP
|
||||
return 1;
|
||||
#else
|
||||
return blas_get_cpu_number();
|
||||
// init blas_cpu_number if needed
|
||||
blas_get_cpu_number();
|
||||
return blas_cpu_number;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1338,6 +1343,7 @@ static void gotoblas_memory_init(void) {
|
||||
/* Initialization for all function; this function should be called before main */
|
||||
|
||||
static int gotoblas_initialized = 0;
|
||||
extern void openblas_read_env();
|
||||
|
||||
void CONSTRUCTOR gotoblas_init(void) {
|
||||
|
||||
@@ -1347,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||
openblas_fork_handler();
|
||||
#endif
|
||||
|
||||
openblas_read_env();
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
@@ -1363,7 +1371,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||
gotoblas_memory_init();
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
//#if defined(OS_LINUX)
|
||||
#if 0
|
||||
struct rlimit curlimit;
|
||||
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
|
||||
{
|
||||
@@ -1443,6 +1452,31 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
This is to allow static linking.
|
||||
Code adapted from Google performance tools:
|
||||
https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
|
||||
Reference:
|
||||
https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
|
||||
http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
|
||||
*/
|
||||
static int on_process_term(void)
|
||||
{
|
||||
gotoblas_quit();
|
||||
return 0;
|
||||
}
|
||||
#ifdef _WIN64
|
||||
#pragma comment(linker, "/INCLUDE:_tls_used")
|
||||
#else
|
||||
#pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
#endif
|
||||
#pragma data_seg(push, old_seg)
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg(pop, old_seg)
|
||||
#endif
|
||||
|
||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||
|
||||
84
driver/others/openblas_env.c
Normal file
84
driver/others/openblas_env.c
Normal file
@@ -0,0 +1,84 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Cached copies of the environment-controlled settings.  Each one starts
 * at 0 and is overwritten by openblas_read_env(); the getters below only
 * hand back the cached value and never consult the environment themselves.
 */
static int openblas_env_verbose = 0;
static unsigned int openblas_env_thread_timeout = 0;
static int openblas_env_block_factor = 0;
static int openblas_env_openblas_num_threads = 0;
static int openblas_env_goto_num_threads = 0;
static int openblas_env_omp_num_threads = 0;

/*
 * Getters.  They are declared with (void) so that they are real
 * prototypes: an empty parameter list would leave the arguments
 * unspecified and let a call with stray arguments compile silently.
 * The existing `extern ...();` declarations remain compatible.
 */

/* Cached value of OPENBLAS_VERBOSE.  */
int openblas_verbose(void) { return openblas_env_verbose; }

/* Cached value of OPENBLAS_THREAD_TIMEOUT.  */
unsigned int openblas_thread_timeout(void) { return openblas_env_thread_timeout; }

/* Cached value of OPENBLAS_BLOCK_FACTOR.  */
int openblas_block_factor(void) { return openblas_env_block_factor; }

/* Cached value of OPENBLAS_NUM_THREADS.  */
int openblas_num_threads_env(void) { return openblas_env_openblas_num_threads; }

/* Cached value of GOTO_NUM_THREADS.  */
int openblas_goto_num_threads_env(void) { return openblas_env_goto_num_threads; }

/* Cached value of OMP_NUM_THREADS.  */
int openblas_omp_num_threads_env(void) { return openblas_env_omp_num_threads; }
|
||||
|
||||
void openblas_read_env() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_verbose=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_block_factor=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_thread_timeout=(unsigned int)ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_openblas_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_goto_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_omp_num_threads=ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int openblas_verbose() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
return ret;
|
||||
}
|
||||
extern int openblas_verbose();
|
||||
|
||||
void openblas_warning(int verbose, const char * msg) {
|
||||
int current_verbose;
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include <string.h>
|
||||
#include "common.h"
|
||||
|
||||
extern int openblas_block_factor();
|
||||
int get_L2_size(void);
|
||||
|
||||
#define DEFAULT_GEMM_P 128
|
||||
@@ -249,7 +250,6 @@ int get_L2_size(void){
|
||||
|
||||
void blas_set_parameter(void){
|
||||
|
||||
env_var_t p;
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
int size = 16;
|
||||
@@ -468,9 +468,8 @@ void blas_set_parameter(void){
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
if (readenv(p,"GOTO_BLOCK_FACTOR")) {
|
||||
factor = atoi(p);
|
||||
factor=openblas_block_factor();
|
||||
if (factor>0) {
|
||||
if (factor < 10) factor = 10;
|
||||
if (factor > 200) factor = 200;
|
||||
|
||||
|
||||
@@ -26,6 +26,10 @@ ifndef ONLY_CBLAS
|
||||
ONLY_CBLAS = 0
|
||||
endif
|
||||
|
||||
ifndef BUILD_LAPACK_DEPRECATED
|
||||
BUILD_LAPACK_DEPRECATED = 0
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifndef ONLY_CBLAS
|
||||
@@ -92,17 +96,17 @@ dll : ../$(LIBDLLNAME)
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
libopenblas.def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
else
|
||||
../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def
|
||||
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
|
||||
../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def
|
||||
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed
|
||||
$(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
|
||||
#only build without Fortran
|
||||
@@ -114,7 +118,7 @@ endif
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
@@ -205,26 +209,26 @@ static : ../$(LIBNAME)
|
||||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF*
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
||||
include ../Makefile.tail
|
||||
|
||||
|
||||
@@ -548,7 +548,6 @@
|
||||
slatmt,
|
||||
sorm22,
|
||||
spotrf2,
|
||||
xerbla,
|
||||
zgejsv,
|
||||
zgesvdx,
|
||||
zgesvj,
|
||||
@@ -591,6 +590,13 @@
|
||||
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
|
||||
);
|
||||
|
||||
@lapack_deprecated_objs = (
|
||||
cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp,
|
||||
cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd,
|
||||
cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm,
|
||||
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
|
||||
);
|
||||
|
||||
@lapackeobjs = (
|
||||
# LAPACK C interface routines.
|
||||
#
|
||||
@@ -2985,6 +2991,11 @@ if ($ARGV[8] == 1) {
|
||||
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
|
||||
};
|
||||
|
||||
if ($ARGV[11] == 1){
|
||||
#BUILD_LAPACK_DEPRECATED=1
|
||||
@underscore_objs =(@underscore_objs, @lapack_deprecated_objs);
|
||||
}
|
||||
|
||||
} else {
|
||||
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
||||
}
|
||||
|
||||
5
f_check
5
f_check
@@ -1,5 +1,7 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
||||
#
|
||||
# 1. Not specified
|
||||
# 1.1 Automatically detect, then check compiler
|
||||
@@ -272,8 +274,9 @@ if ($link ne "") {
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-Y/) {
|
||||
next if ($hostos eq 'SunOS');
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
}
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
|
||||
39
getarch.c
39
getarch.c
@@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__sun__)
|
||||
#include <sys/sysinfo.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
@@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "POWER5"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
|
||||
#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER6"
|
||||
@@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "POWER6"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_POWER8)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER8"
|
||||
#define SUBDIRNAME "power"
|
||||
#define ARCHCONFIG "-DPOWER8 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "power8"
|
||||
#define CORENAME "POWER8"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_PPCG4
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
@@ -848,6 +862,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#if defined(__zarch__) || defined(__s390x__)
|
||||
#define ZARCH
|
||||
#include "cpuid_zarch.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_AMD
|
||||
#include "cpuid_x86.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
@@ -906,7 +926,7 @@ static int get_num_cores(void) {
|
||||
size_t len;
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__sun__)
|
||||
//returns the number of processors which are currently online
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
|
||||
@@ -943,7 +963,7 @@ int main(int argc, char *argv[]){
|
||||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
@@ -998,7 +1018,14 @@ int main(int argc, char *argv[]){
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if NO_PARALLEL_MAKE==1
|
||||
#ifdef MAKE_NB_JOBS
|
||||
#if MAKE_NB_JOBS > 0
|
||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
||||
#else
|
||||
// Let make use parent -j argument or -j1 if there
|
||||
// is no make parent
|
||||
#endif
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
#else
|
||||
#ifndef OS_WINDOWS
|
||||
@@ -1043,7 +1070,7 @@ int main(int argc, char *argv[]){
|
||||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -64,10 +64,13 @@ int main(int argc, char **argv) {
|
||||
|
||||
|
||||
if ((argc >= 2) && (*argv[1] == '1')) {
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64)
|
||||
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
|
||||
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
|
||||
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
|
||||
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
|
||||
#endif
|
||||
|
||||
#ifdef USE64BITINT
|
||||
printf("#define USE64BITINT\n");
|
||||
|
||||
@@ -79,11 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
||||
FLOAT alpha = *ALPHA;
|
||||
FLOAT beta = *BETA;
|
||||
FLOAT *buffer;
|
||||
int buffer_size;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
|
||||
@@ -134,13 +132,10 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
FLOAT *buffer;
|
||||
blasint lenx, leny;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
blasint info, t;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
|
||||
@@ -215,43 +210,20 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incx < 0) x -= (lenx - 1) * incx;
|
||||
if (incy < 0) y -= (leny - 1) * incy;
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
// make it volatile because some gemv implementation (ex: dgemv_n.S)
|
||||
// do not restore all register
|
||||
volatile int stack_alloc_size = 0;
|
||||
//for gemv_n and gemv_t, try to allocate on stack
|
||||
stack_alloc_size = m + n;
|
||||
#ifdef ALIGNED_ACCESS
|
||||
stack_alloc_size += 3;
|
||||
#endif
|
||||
if(stack_alloc_size < 128)
|
||||
//dgemv_n.S require a 128 bytes buffer
|
||||
stack_alloc_size = 128;
|
||||
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||
stack_alloc_size = 0;
|
||||
|
||||
FLOAT stack_buffer[stack_alloc_size];
|
||||
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||
// printf("stack_alloc_size=%d\n", stack_alloc_size);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
buffer_size = m + n + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT) ;
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
nthreads_max = num_cpu_avail(2);
|
||||
nthreads_avail = nthreads_max;
|
||||
|
||||
MNK = (double) m * (double) n;
|
||||
if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
|
||||
nthreads_max = 1;
|
||||
|
||||
if ( nthreads_max > nthreads_avail )
|
||||
nthreads = nthreads_avail;
|
||||
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = nthreads_max;
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
@@ -266,14 +238,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
if(!stack_alloc_size){
|
||||
blas_memory_free(buffer);
|
||||
}
|
||||
#else
|
||||
blas_memory_free(buffer);
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
@@ -171,19 +171,14 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incy < 0) y -= (n - 1) * incy;
|
||||
if (incx < 0) x -= (m - 1) * incx;
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
volatile int stack_alloc_size = m;
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||
stack_alloc_size = 0;
|
||||
FLOAT stack_buffer[stack_alloc_size];
|
||||
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||
#else
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#endif
|
||||
STACK_ALLOC(m, FLOAT, buffer);
|
||||
|
||||
#ifdef SMPTEST
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||
if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = num_cpu_avail(2);
|
||||
else
|
||||
nthreads = 1;
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
@@ -198,11 +193,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
if(!stack_alloc_size)
|
||||
#endif
|
||||
blas_memory_free(buffer);
|
||||
|
||||
STACK_FREE(buffer);
|
||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
@@ -95,7 +95,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
s = db / r;
|
||||
z = ONE;
|
||||
if (ada > adb) z = s;
|
||||
if ((ada < adb) && (c != ZERO)) z = ONE / c;
|
||||
if ((ada <= adb) && (c != ZERO)) z = ONE / c;
|
||||
|
||||
*C = c;
|
||||
*S = s;
|
||||
|
||||
@@ -77,12 +77,13 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
|
||||
if (incy < 0) y -= (n - 1) * incy;
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0)
|
||||
nthreads = 1;
|
||||
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -77,11 +77,9 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
||||
blasint incy = *INCY;
|
||||
|
||||
FLOAT *buffer;
|
||||
int buffer_size;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
|
||||
@@ -144,13 +142,10 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
FLOAT *buffer;
|
||||
blasint lenx, leny;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
blasint info, t;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#endif
|
||||
|
||||
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
|
||||
@@ -236,22 +231,26 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
||||
if (incy < 0) y -= (leny - 1) * incy * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT) ;
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
// cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746)
|
||||
if(trans && stack_alloc_size)
|
||||
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
nthreads_max = num_cpu_avail(2);
|
||||
nthreads_avail = nthreads_max;
|
||||
|
||||
MNK = (double) m * (double) n;
|
||||
if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) ))
|
||||
nthreads_max = 1;
|
||||
|
||||
if ( nthreads_max > nthreads_avail )
|
||||
nthreads = nthreads_avail;
|
||||
if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = nthreads_max;
|
||||
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
@@ -267,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
||||
@@ -210,10 +210,14 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||
if (incx < 0) x -= (m - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
STACK_ALLOC(2 * m, FLOAT, buffer);
|
||||
|
||||
#ifdef SMPTEST
|
||||
nthreads = num_cpu_avail(2);
|
||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||
if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = num_cpu_avail(2);
|
||||
else
|
||||
nthreads = 1;
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
@@ -245,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
|
||||
blasint info;
|
||||
int uplo;
|
||||
int unit;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
|
||||
|
||||
int trans, uplo, unit;
|
||||
int trans, uplo, unit, buffer_size;
|
||||
blasint info;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#ifdef SMP
|
||||
// Calibrated on a Xeon E5-2630
|
||||
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
|
||||
nthreads = num_cpu_avail(2);
|
||||
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 2;
|
||||
} else
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
|
||||
if(incx != 1)
|
||||
buffer_size += n * 2;
|
||||
}
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
|
||||
|
||||
|
||||
@@ -227,6 +227,28 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
#gemm3m
|
||||
if (USE_GEMM3M)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM3MKERNEL}" "NN" "gemm3m_kernel" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_oncopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_oncopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_oncopyi" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_otcopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_otcopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_otcopyi" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_incopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_incopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_incopyi" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_itcopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_itcopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_itcopyi" false "" "" false ${float_type})
|
||||
|
||||
endif()
|
||||
|
||||
else () #For real
|
||||
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
|
||||
|
||||
|
||||
@@ -36,6 +36,15 @@ ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
SKERNELOBJS += \
|
||||
|
||||
@@ -1,26 +1,4 @@
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
|
||||
#ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
#ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
#ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
#ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
|
||||
|
||||
#STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
#SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
#SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
|
||||
|
||||
###############################################################################
|
||||
@@ -96,19 +74,19 @@ DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
ZSWAPKERNEL = swap_vfp.S
|
||||
|
||||
# BAD SGEMVNKERNEL = gemv_n_vfp.S
|
||||
# BAD DGEMVNKERNEL = gemv_n_vfp.S
|
||||
# CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
SGEMVNKERNEL = gemv_n_vfp.S
|
||||
DGEMVNKERNEL = gemv_n_vfp.S
|
||||
CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
ZGEMVNKERNEL = zgemv_n_vfp.S
|
||||
|
||||
# BAD SGEMVTKERNEL = gemv_t_vfp.S
|
||||
# BAD DGEMVTKERNEL = gemv_t_vfp.S
|
||||
# CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
SGEMVTKERNEL = gemv_t_vfp.S
|
||||
DGEMVTKERNEL = gemv_t_vfp.S
|
||||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
#CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
|
||||
@@ -131,9 +109,9 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
#CGEMMKERNEL = cgemm_kernel_2x2_vfp.S
|
||||
#CGEMMONCOPY = cgemm_ncopy_2_vfp.S
|
||||
#CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
|
||||
CGEMMKERNEL = cgemm_kernel_2x2_vfp.S
|
||||
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
|
||||
@@ -1,8 +1,3 @@
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
|
||||
#################################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
@@ -77,14 +72,14 @@ DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
# BAD SGEMVNKERNEL = gemv_n_vfp.S
|
||||
DGEMVNKERNEL = gemv_n_vfp.S
|
||||
#CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
SGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
DGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
ZGEMVNKERNEL = zgemv_n_vfp.S
|
||||
|
||||
# BAD SGEMVTKERNEL = gemv_t_vfp.S
|
||||
SGEMVTKERNEL = gemv_t_vfp.S
|
||||
DGEMVTKERNEL = gemv_t_vfp.S
|
||||
#CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
@@ -92,24 +87,15 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
|
||||
SGEMMOTCOPY = sgemm_tcopy_4_vfp.S
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = dgemm_ncopy_4_vfp.S
|
||||
DGEMMOTCOPY = dgemm_tcopy_4_vfp.S
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
|
||||
@@ -367,12 +367,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.align 5
|
||||
|
||||
#if defined(DOUBLE)
|
||||
vsub.f64 d0 , d0 , d0
|
||||
vsub.f64 d1 , d1 , d1
|
||||
#else
|
||||
vsub.f32 s0 , s0 , s0
|
||||
vsub.f32 s1 , s1 , s1
|
||||
movs r12, #0 // clear floating point register
|
||||
vmov s0, r12
|
||||
vmov s1, r12
|
||||
#if defined(DOUBLE)
|
||||
vcvt.f64.f32 d0, s0
|
||||
vcvt.f64.f32 d1, s1
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
|
||||
@@ -185,14 +185,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r4, fp, #128
|
||||
vstm r4, { s8 - s15} // store floating point registers
|
||||
|
||||
movs r4, #0 // clear floating point register
|
||||
vmov s0, r4
|
||||
vmov s1, s0
|
||||
vmov s2, s0
|
||||
vmov s3, s0
|
||||
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
|
||||
vsub.f32 s0 , s0 , s0
|
||||
vsub.f32 s1 , s1 , s1
|
||||
vsub.f32 s2 , s2 , s2
|
||||
vsub.f32 s3 , s3 , s3
|
||||
|
||||
cmp N, #0
|
||||
ble cdot_kernel_L999
|
||||
|
||||
|
||||
@@ -57,6 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N [fp, #-260 ]
|
||||
#define K [fp, #-264 ]
|
||||
|
||||
#define FP_ZERO [fp, #-240]
|
||||
#define FP_ZERO_0 [fp, # -240]
|
||||
#define FP_ZERO_1 [fp, # -236]
|
||||
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
@@ -138,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10, s8
|
||||
vmov.f32 s11, s8
|
||||
@@ -340,7 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s12, s8
|
||||
vmov.f32 s13, s8
|
||||
@@ -514,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10, s8
|
||||
vmov.f32 s11, s8
|
||||
@@ -681,7 +685,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
|
||||
.endm
|
||||
@@ -822,6 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s15} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 4 * 2
|
||||
str r3, LDC
|
||||
|
||||
@@ -73,6 +73,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N [fp, #-260 ]
|
||||
#define K [fp, #-264 ]
|
||||
|
||||
#define FP_ZERO [fp, #-240]
|
||||
#define FP_ZERO_0 [fp, # -240]
|
||||
#define FP_ZERO_1 [fp, # -236]
|
||||
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
@@ -147,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16, FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s18, s16
|
||||
vmov.f32 s19, s16
|
||||
@@ -368,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16, FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s20, s16
|
||||
vmov.f32 s21, s16
|
||||
@@ -550,7 +554,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16, FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s18, s16
|
||||
vmov.f32 s19, s16
|
||||
@@ -730,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16, FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s24, s16
|
||||
vmov.f32 s25, s16
|
||||
@@ -879,6 +883,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s31} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 4 * 2
|
||||
str r3, LDC
|
||||
|
||||
@@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define I r12
|
||||
|
||||
#define FP_ZERO [fp, #-228]
|
||||
#define FP_ZERO_0 [fp, #-228]
|
||||
#define FP_ZERO_1 [fp, #-224]
|
||||
|
||||
#define ALPHA_I [fp, #-236]
|
||||
#define ALPHA_R [fp, #-244]
|
||||
|
||||
@@ -117,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro INIT_F4
|
||||
|
||||
pld [ YO, #Y_PRE ]
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10, s8
|
||||
vmov.f32 s11, s8
|
||||
@@ -220,7 +224,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
|
||||
.endm
|
||||
@@ -267,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S4
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10, s8
|
||||
vmov.f32 s11, s8
|
||||
@@ -384,7 +388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
|
||||
.endm
|
||||
@@ -448,6 +452,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vstm r12, { s8 - s15 } // store floating point registers
|
||||
#endif
|
||||
|
||||
movs r12, #0
|
||||
str r12, FP_ZERO
|
||||
str r12, FP_ZERO_1
|
||||
|
||||
cmp OLD_M, #0
|
||||
ble cgemvn_kernel_L999
|
||||
|
||||
|
||||
@@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define I r12
|
||||
|
||||
#define FP_ZERO [fp, #-228]
|
||||
#define FP_ZERO_0 [fp, #-228]
|
||||
#define FP_ZERO_1 [fp, #-224]
|
||||
|
||||
#define N [fp, #-252 ]
|
||||
#define A [fp, #-256 ]
|
||||
|
||||
@@ -116,10 +120,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F2
|
||||
|
||||
vsub.f32 s12, s12, s12
|
||||
vsub.f32 s13, s13, s13
|
||||
vsub.f32 s14, s14, s14
|
||||
vsub.f32 s15, s15, s15
|
||||
flds s12, FP_ZERO
|
||||
vmov.f32 s13, s12
|
||||
vmov.f32 s14, s12
|
||||
vmov.f32 s15, s12
|
||||
|
||||
.endm
|
||||
|
||||
@@ -172,8 +176,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f32 s12, s12, s12
|
||||
vsub.f32 s13, s13, s13
|
||||
flds s12, FP_ZERO
|
||||
vmov.f32 s13, s12
|
||||
|
||||
.endm
|
||||
|
||||
@@ -215,10 +219,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S2
|
||||
|
||||
vsub.f32 s12, s12, s12
|
||||
vsub.f32 s13, s13, s13
|
||||
vsub.f32 s14, s14, s14
|
||||
vsub.f32 s15, s15, s15
|
||||
flds s12, FP_ZERO
|
||||
vmov.f32 s13, s12
|
||||
vmov.f32 s14, s12
|
||||
vmov.f32 s15, s12
|
||||
|
||||
.endm
|
||||
|
||||
@@ -281,8 +285,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f32 s12, s12, s12
|
||||
vsub.f32 s13, s13, s13
|
||||
flds s12, FP_ZERO
|
||||
vmov.f32 s13, s12
|
||||
|
||||
.endm
|
||||
|
||||
@@ -345,6 +349,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vstm r12, { s8 - s15 } // store floating point registers
|
||||
#endif
|
||||
|
||||
movs r12, #0
|
||||
str r12, FP_ZERO
|
||||
str r12, FP_ZERO_1
|
||||
|
||||
cmp M, #0
|
||||
ble cgemvt_kernel_L999
|
||||
|
||||
|
||||
@@ -59,6 +59,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N [fp, #-260 ]
|
||||
#define K [fp, #-264 ]
|
||||
|
||||
#define FP_ZERO [fp, #-232]
|
||||
#define FP_ZERO_0 [fp, #-232]
|
||||
#define FP_ZERO_1 [fp, #-228]
|
||||
|
||||
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
@@ -136,7 +141,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10, s8
|
||||
vmov.f32 s11, s8
|
||||
@@ -301,10 +306,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
vsub.f32 s4, s4, s4
|
||||
vsub.f32 s5, s5, s5
|
||||
vsub.f32 s6, s6, s6
|
||||
vsub.f32 s7, s7, s7
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
vmov.f32 s6, s4
|
||||
vmov.f32 s7, s4
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -318,10 +323,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
|
||||
vsub.f32 s4, s4, s4
|
||||
vsub.f32 s5, s5, s5
|
||||
vsub.f32 s6, s6, s6
|
||||
vsub.f32 s7, s7, s7
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
vmov.f32 s6, s4
|
||||
vmov.f32 s7, s4
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -343,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s12, s8
|
||||
vmov.f32 s13, s8
|
||||
@@ -490,8 +495,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
vsub.f32 s4, s4, s4
|
||||
vsub.f32 s5, s5, s5
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -500,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
|
||||
vsub.f32 s4, s4, s4
|
||||
vsub.f32 s5, s5, s5
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -519,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10, s8
|
||||
vmov.f32 s11, s8
|
||||
@@ -663,10 +668,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
vsub.f32 s4, s4, s4
|
||||
vsub.f32 s5, s5, s5
|
||||
vsub.f32 s6, s6, s6
|
||||
vsub.f32 s7, s7, s7
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
vmov.f32 s6, s4
|
||||
vmov.f32 s7, s4
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -689,7 +694,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
|
||||
.endm
|
||||
@@ -795,8 +800,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
vsub.f32 s4, s4, s4
|
||||
vsub.f32 s5, s5, s5
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -831,6 +836,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s15} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 4 * 2
|
||||
str r3, LDC
|
||||
|
||||
@@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N [fp, #-260 ]
|
||||
#define K [fp, #-264 ]
|
||||
|
||||
#define FP_ZERO [fp, #-236]
|
||||
#define FP_ZERO_0 [fp, #-236]
|
||||
#define FP_ZERO_1 [fp, #-232]
|
||||
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
@@ -134,7 +138,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16 , FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s18, s16
|
||||
vmov.f32 s19, s16
|
||||
@@ -351,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16 , FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s20, s16
|
||||
vmov.f32 s21, s16
|
||||
@@ -529,7 +533,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16 , FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s18, s16
|
||||
vmov.f32 s19, s16
|
||||
@@ -706,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
flds s16 , FP_ZERO
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s24, s16
|
||||
vmov.f32 s25, s16
|
||||
@@ -852,6 +856,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s31} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 4 * 2
|
||||
str r3, LDC
|
||||
|
||||
@@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
* 2016/01/23 Saar
|
||||
* Bugfix for Refs #750 and #740
|
||||
**************************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
@@ -152,8 +154,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
|
||||
vsub.f64 d0 , d0 , d0
|
||||
vsub.f64 d1 , d1 , d1
|
||||
movs r4, #0 // clear floating point register
|
||||
vmov s0, r4
|
||||
vmov s1, r4
|
||||
vcvt.f64.f32 d0, s0
|
||||
vcvt.f64.f32 d1, s1
|
||||
|
||||
|
||||
cmp N, #0
|
||||
ble ddot_kernel_L999
|
||||
|
||||
@@ -56,8 +56,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define K [fp, #-264 ]
|
||||
#define A [fp, #-268 ]
|
||||
|
||||
#define FP_ZERO [fp, #-240]
|
||||
#define FP_ZERO_0 [fp, # -240]
|
||||
#define FP_ZERO_1 [fp, # -236]
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
@@ -85,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x2
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8, FP_ZERO
|
||||
vmov.f64 d9, d8
|
||||
vmov.f64 d10, d8
|
||||
vmov.f64 d11, d8
|
||||
@@ -173,7 +178,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8, FP_ZERO
|
||||
vmov.f64 d9, d8
|
||||
vmov.f64 d12, d8
|
||||
vmov.f64 d13, d8
|
||||
@@ -233,7 +238,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8, FP_ZERO
|
||||
vmov.f64 d12, d8
|
||||
|
||||
.endm
|
||||
@@ -283,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x1
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8, FP_ZERO
|
||||
vmov.f64 d9, d8
|
||||
vmov.f64 d10, d8
|
||||
vmov.f64 d11, d8
|
||||
@@ -338,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8, FP_ZERO
|
||||
vmov.f64 d9 , d8
|
||||
|
||||
.endm
|
||||
@@ -380,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8, FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -433,6 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { d8 - d15} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 8
|
||||
str r3, LDC
|
||||
|
||||
@@ -73,6 +73,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define K [fp, #-264 ]
|
||||
#define A [fp, #-268 ]
|
||||
|
||||
#define FP_ZERO [fp, #-240]
|
||||
#define FP_ZERO_0 [fp, # -240]
|
||||
#define FP_ZERO_1 [fp, # -236]
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#define B [fp, #4 ]
|
||||
@@ -102,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
@@ -376,7 +380,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d21, d16
|
||||
@@ -470,7 +474,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d24, d16
|
||||
vmov.f64 d28, d16
|
||||
@@ -533,7 +537,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
@@ -617,7 +621,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d21, d16
|
||||
@@ -678,7 +682,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d20, d16
|
||||
|
||||
.endm
|
||||
@@ -723,7 +727,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x1
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
@@ -782,7 +786,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
|
||||
.endm
|
||||
@@ -826,7 +830,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -880,6 +884,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
str OLD_A, A
|
||||
vstr OLD_ALPHA, ALPHA
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { d8 - d15} // store floating point registers
|
||||
|
||||
|
||||
@@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define K [fp, #-264 ]
|
||||
#define A [fp, #-268 ]
|
||||
|
||||
#define FP_ZERO [fp, #-232]
|
||||
#define FP_ZERO_0 [fp, #-232]
|
||||
#define FP_ZERO_1 [fp, #-228]
|
||||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#define B [fp, #4 ]
|
||||
@@ -90,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x2
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
vmov.f64 d9, d8
|
||||
vmov.f64 d10, d8
|
||||
vmov.f64 d11, d8
|
||||
@@ -165,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
vmov.f64 d9, d8
|
||||
vmov.f64 d12, d8
|
||||
vmov.f64 d13, d8
|
||||
@@ -220,7 +224,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
vmov.f64 d12, d8
|
||||
|
||||
.endm
|
||||
@@ -268,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x1
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
vmov.f64 d9, d8
|
||||
vmov.f64 d10, d8
|
||||
vmov.f64 d11, d8
|
||||
@@ -318,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
vmov.f64 d9 , d8
|
||||
|
||||
.endm
|
||||
@@ -357,7 +361,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -409,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { d8 - d15} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 8
|
||||
str r3, LDC
|
||||
|
||||
@@ -59,6 +59,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define K [fp, #-264 ]
|
||||
#define A [fp, #-268 ]
|
||||
|
||||
#define FP_ZERO [fp, #-236]
|
||||
#define FP_ZERO_0 [fp, #-236]
|
||||
#define FP_ZERO_1 [fp, #-232]
|
||||
|
||||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#define B [fp, #4 ]
|
||||
@@ -89,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
@@ -386,7 +391,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d21, d16
|
||||
@@ -468,7 +473,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d24, d16
|
||||
vmov.f64 d28, d16
|
||||
@@ -527,7 +532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
@@ -601,7 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d21, d16
|
||||
@@ -656,7 +661,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d20, d16
|
||||
|
||||
.endm
|
||||
@@ -699,7 +704,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT4x1
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
@@ -753,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT2x1
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
vmov.f64 d17, d16
|
||||
|
||||
.endm
|
||||
@@ -794,7 +799,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT1x1
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
fldd d16, FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -850,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
sub r3, fp, #128
|
||||
vstm r3, { d8 - d15} // store floating point registers
|
||||
|
||||
movs r4, #0
|
||||
str r4, FP_ZERO
|
||||
str r4, FP_ZERO_1
|
||||
|
||||
ldr r3, OLD_LDC
|
||||
lsl r3, r3, #3 // ldc = ldc * 8
|
||||
str r3, LDC
|
||||
|
||||
@@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define I r12
|
||||
|
||||
#define FP_ZERO [fp, #-228]
|
||||
#define FP_ZERO_0 [fp, #-228]
|
||||
#define FP_ZERO_1 [fp, #-224]
|
||||
|
||||
#define M [fp, #-252 ]
|
||||
#define A [fp, #-256 ]
|
||||
|
||||
@@ -79,7 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ YO , #Y_PRE ]
|
||||
pld [ YO , #Y_PRE+32 ]
|
||||
|
||||
vsub.f64 d8 , d8 , d8
|
||||
fldd d8 , FP_ZERO
|
||||
vmov.f64 d9 , d8
|
||||
vmov.f64 d10 , d8
|
||||
vmov.f64 d11 , d8
|
||||
@@ -158,7 +162,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f64 d12 , d12 , d12
|
||||
fldd d12 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -185,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S4
|
||||
|
||||
vsub.f64 d12 , d12 , d12
|
||||
fldd d12 , FP_ZERO
|
||||
vmov.f64 d13 , d12
|
||||
vmov.f64 d14 , d12
|
||||
vmov.f64 d15 , d12
|
||||
@@ -245,7 +249,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f64 d12 , d12 , d12
|
||||
fldd d12 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -279,7 +283,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ YO , #Y_PRE ]
|
||||
|
||||
vsub.f32 s8 , s8 , s8
|
||||
flds s8 , FP_ZERO
|
||||
vmov.f32 s9 , s8
|
||||
vmov.f32 s10 , s8
|
||||
vmov.f32 s11 , s8
|
||||
@@ -357,7 +361,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f32 s12 , s12 , s12
|
||||
flds s12 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -384,7 +388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S4
|
||||
|
||||
vsub.f32 s12 , s12 , s12
|
||||
flds s12 , FP_ZERO
|
||||
vmov.f32 s13 , s12
|
||||
vmov.f32 s14 , s12
|
||||
vmov.f32 s15 , s12
|
||||
@@ -445,7 +449,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f32 s12 , s12 , s12
|
||||
flds s12 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -494,6 +498,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vstm r12, { s8 - s15 } // store floating point registers
|
||||
#endif
|
||||
|
||||
movs r12, #0
|
||||
str r12, FP_ZERO
|
||||
str r12, FP_ZERO_1
|
||||
|
||||
cmp OLD_M, #0
|
||||
ble gemvn_kernel_L999
|
||||
|
||||
|
||||
@@ -62,6 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define M [fp, #-252 ]
|
||||
#define A [fp, #-256 ]
|
||||
|
||||
#define FP_ZERO [fp, #-228]
|
||||
#define FP_ZERO_0 [fp, #-228]
|
||||
#define FP_ZERO_1 [fp, #-224]
|
||||
|
||||
|
||||
#define X_PRE 64
|
||||
#define Y_PRE 0
|
||||
@@ -79,7 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ YO , #Y_PRE ]
|
||||
pld [ YO , #Y_PRE+32 ]
|
||||
|
||||
vsub.f64 d24 , d24 , d24
|
||||
fldd d24 , FP_ZERO
|
||||
vmov.f64 d25 , d24
|
||||
vmov.f64 d26 , d24
|
||||
vmov.f64 d27 , d24
|
||||
@@ -147,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f64 d24 , d24 , d24
|
||||
fldd d24 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -175,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S8
|
||||
|
||||
vsub.f64 d24 , d24 , d24
|
||||
fldd d24 , FP_ZERO
|
||||
vmov.f64 d25 , d24
|
||||
vmov.f64 d26 , d24
|
||||
vmov.f64 d27 , d24
|
||||
@@ -269,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f64 d24 , d24 , d24
|
||||
fldd d24 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -302,7 +306,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ YO , #Y_PRE ]
|
||||
|
||||
vsub.f32 s24 , s24 , s24
|
||||
flds s24 , FP_ZERO
|
||||
vmov.f32 s25 , s24
|
||||
vmov.f32 s26 , s24
|
||||
vmov.f32 s27 , s24
|
||||
@@ -368,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f32 s24 , s24 , s24
|
||||
flds s24 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -396,7 +400,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S8
|
||||
|
||||
vsub.f32 s24 , s24 , s24
|
||||
flds s24 , FP_ZERO
|
||||
vmov.f32 s25 , s24
|
||||
vmov.f32 s26 , s24
|
||||
vmov.f32 s27 , s24
|
||||
@@ -489,7 +493,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f32 s24 , s24 , s24
|
||||
flds s24 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -538,6 +542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vstm r12, { s8 - s31 } // store floating point registers
|
||||
#endif
|
||||
|
||||
movs r12, #0
|
||||
str r12, FP_ZERO
|
||||
str r12, FP_ZERO_1
|
||||
|
||||
cmp OLD_M, #0
|
||||
ble gemvn_kernel_L999
|
||||
|
||||
|
||||
@@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define I r12
|
||||
|
||||
#define FP_ZERO [fp, #-228]
|
||||
#define FP_ZERO_0 [fp, #-228]
|
||||
#define FP_ZERO_1 [fp, #-224]
|
||||
|
||||
#define N [fp, #-252 ]
|
||||
#define A [fp, #-256 ]
|
||||
|
||||
@@ -75,8 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F2
|
||||
|
||||
vsub.f64 d2 , d2 , d2
|
||||
vsub.f64 d3 , d3 , d3
|
||||
fldd d2, FP_ZERO
|
||||
vmov.f64 d3 , d2
|
||||
|
||||
.endm
|
||||
|
||||
@@ -123,7 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f64 d2 , d2 , d2
|
||||
fldd d2, FP_ZERO
|
||||
vmov.f64 d3 , d2
|
||||
|
||||
.endm
|
||||
|
||||
@@ -160,8 +165,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S2
|
||||
|
||||
vsub.f64 d2 , d2 , d2
|
||||
vsub.f64 d3 , d3 , d3
|
||||
fldd d2, FP_ZERO
|
||||
vmov.f64 d3 , d2
|
||||
|
||||
.endm
|
||||
|
||||
@@ -224,7 +229,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f64 d2 , d2 , d2
|
||||
fldd d2, FP_ZERO
|
||||
vmov.f64 d3 , d2
|
||||
|
||||
.endm
|
||||
|
||||
@@ -276,8 +282,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F2
|
||||
|
||||
vsub.f32 s2 , s2 , s2
|
||||
vsub.f32 s3 , s3 , s3
|
||||
flds s2 , FP_ZERO
|
||||
vmov.f32 s3 , s2
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
@@ -321,7 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F1
|
||||
|
||||
vsub.f32 s2 , s2 , s2
|
||||
flds s2 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -356,8 +363,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S2
|
||||
|
||||
vsub.f32 s2 , s2 , s2
|
||||
vsub.f32 s3 , s3 , s3
|
||||
flds s2 , FP_ZERO
|
||||
vmov.f32 s3 , s2
|
||||
|
||||
.endm
|
||||
|
||||
@@ -418,7 +425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S1
|
||||
|
||||
vsub.f32 s2 , s2 , s2
|
||||
flds s2 , FP_ZERO
|
||||
|
||||
.endm
|
||||
|
||||
@@ -488,6 +495,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vstm r12, { s8 - s15 } // store floating point registers
|
||||
#endif
|
||||
|
||||
movs r12, #0
|
||||
str r12, FP_ZERO
|
||||
str r12, FP_ZERO_1
|
||||
|
||||
cmp M, #0
|
||||
ble gemvt_kernel_L999
|
||||
|
||||
|
||||
@@ -341,11 +341,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.align 5
|
||||
push {r4}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
vsub.f64 d0 , d0 , d0
|
||||
#else
|
||||
vsub.f32 s0 , s0 , s0
|
||||
movs r12, #0 // clear floating point register
|
||||
vmov s0, r12
|
||||
#if defined(DOUBLE)
|
||||
vcvt.f64.f32 d0, s0
|
||||
#endif
|
||||
|
||||
mov INDEX, #0
|
||||
|
||||
cmp N, #0
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user