Compare commits

..

703 Commits

Author SHA1 Message Date
Martin Kroeker
5fdf9ad24f Merge pull request #2228 from martin-frbg/issue2227
Add Intel Goldmont Plus CPUID
2019-08-19 18:26:51 +02:00
Martin Kroeker
2fe967c542 Merge branch 'develop' into issue2227 2019-08-19 14:20:39 +02:00
Martin Kroeker
6d8595351c Add Intel Goldmont Plus CPUID
fixes #2227
2019-08-19 14:19:21 +02:00
Martin Kroeker
f40200f559 Merge pull request #2223 from martin-frbg/getarch-pgi
Make getarch compile with PGI
2019-08-16 12:21:30 +02:00
Martin Kroeker
a95a5e52b8 Fix PGI compiler detection for getarch 2019-08-16 09:00:11 +02:00
Martin Kroeker
e3d846ab57 Do not use -march=native with the PGI compiler 2019-08-16 08:58:10 +02:00
Martin Kroeker
8506386d82 Merge pull request #1 from xianyi/develop
rebase
2019-08-16 08:56:15 +02:00
Martin Kroeker
9ef96b32a6 Add multithreading support to the x86_64 zdot kernel (#2222)
* Add multithreading support

copied from the ThunderX2T99 kernel. For #2221
2019-08-15 22:09:12 +02:00
Martin Kroeker
b48c025974 Merge pull request #2218 from martin-frbg/issue2215
Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
2019-08-14 07:32:31 +02:00
Martin Kroeker
a1fce67743 Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
fixes #2215
2019-08-13 22:29:48 +02:00
Martin Kroeker
103b32fdb7 Merge pull request #2216 from martin-frbg/issue2214
Remove case-sensitivity in x86 LSAME on (AMD) cpus without CMOV
2019-08-13 13:59:33 +02:00
Martin Kroeker
aef9804089 Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV
Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches.
Fixes #2214
2019-08-13 10:19:10 +02:00
Martin Kroeker
303869f572 Update with changes from 0.3.7 2019-08-11 23:31:36 +02:00
Martin Kroeker
02d9203981 Increment version to 0.3.8.dev 2019-08-11 23:28:47 +02:00
Martin Kroeker
7b6808b69c Increment version to 0.3.8.dev 2019-08-11 23:28:13 +02:00
Martin Kroeker
321288597c Merge pull request #2212 from martin-frbg/nofort-nolib
Avoid spurious dependency on the fortran runtime despite NOFORTRAN=1
2019-08-11 20:26:34 +02:00
Martin Kroeker
be147a9f28 Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1
for cases where a fortran compiler is present but not wanted (e.g. not fully functional)
2019-08-11 16:24:39 +02:00
Martin Kroeker
c275290ea6 Merge pull request #2211 from martin-frbg/arm64_gcc_trivial
Silence two nuisance warnings from gcc
2019-08-11 16:08:05 +02:00
Martin Kroeker
b7bbb02447 Silence two nuisance warnings from gcc 2019-08-11 12:46:05 +02:00
Martin Kroeker
bf1430f7d7 Merge pull request #2208 from martin-frbg/munmap-debug
Provide more information on mmap/munmap failure
2019-08-09 07:55:35 +02:00
Martin Kroeker
dccff2e785 Merge pull request #2206 from martin-frbg/zen-dtrmm
Replace vpermpd with vpermilpd in the Haswell DTRMM kernel
2019-08-09 07:55:20 +02:00
Martin Kroeker
5c3458a6e7 Merge pull request #2199 from martin-frbg/zen-dtrsm
Replace most vpermpd calls in the Haswell DTRSM_RN kernel
2019-08-09 07:55:02 +02:00
Martin Kroeker
1776ad82c0 Add files via upload 2019-08-09 00:08:11 +02:00
Martin Kroeker
4e2f81cfa1 Provide more information on mmap/munmap failure
for #2207
2019-08-08 23:15:35 +02:00
Martin Kroeker
acf6002ab2 Replace most vpermpd calls in the Haswell DTRSM_RN kernel 2019-08-03 12:40:13 +02:00
Martin Kroeker
96a794e9fd Merge pull request #2198 from martin-frbg/icelake
Update CPUID recognition for Intel Ice Lake
2019-08-02 08:36:14 +02:00
Martin Kroeker
3d36c45116 Add CPUID identification of Intel Ice Lake 2019-08-01 22:52:35 +02:00
Martin Kroeker
648491e1aa Autodetect Intel Ice Lake (as SKYLAKEX target) 2019-08-01 22:51:09 +02:00
Martin Kroeker
2dfb804cb9 Replace vpermpd with vpermilpd in the Haswell DTRMM kernel
to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186
2019-07-28 23:17:28 +02:00
Martin Kroeker
4c153ec9da Merge pull request #2196 from wjc404/develop
Add vbroadcastsd kernel to dgemm_kernel_4x8_haswell.S
2019-07-28 23:11:40 +02:00
wjc404
7eecd8e39c Add files via upload 2019-07-28 07:39:09 +08:00
Martin Kroeker
f0406a7708 Merge pull request #2112 from ffontaine/develop
Makefile.arm: remove -march flags
2019-07-27 13:00:13 +02:00
Martin Kroeker
561f3fd995 Merge pull request #2193 from martin-frbg/makeutest
Override special make variables
2019-07-24 20:19:21 +02:00
Martin Kroeker
30efed14d1 Unset special make variables in ctest Makefile as well 2019-07-24 15:26:09 +02:00
Martin Kroeker
af2e7f28fc Override special make variables
as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. 
(Other subtargets appear to be unaffected as they do not use implicit make rules)
2019-07-23 16:56:40 +02:00
Martin Kroeker
4250e6ed64 Merge pull request #2191 from tylerjereddy/conditional_updates
MAINT: remove legacy CMake endif()
2019-07-23 16:20:39 +02:00
Martin Kroeker
7b0b7c11d2 Merge pull request #2190 from martin-frbg/zdot-zen
Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel
2019-07-23 16:15:08 +02:00
Martin Kroeker
d14cf1ccf4 Merge pull request #2189 from wjc404/develop
Update dgemm_kernel_4x8_haswell.S for reducing cache misses
2019-07-23 08:32:56 +02:00
Tyler Reddy
3f6ab1582a MAINT: remove legacy CMake endif()
* clean up a case where CMake endif()
contained the conditional used in the
if(), which is no longer needed /
discouraged since our minimum required
CMake version supports the modern syntax
2019-07-22 21:24:57 -06:00
Martin Kroeker
28e96458e5 Replace vpermpd with vpermilpd
to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180)
2019-07-22 08:28:16 +02:00
wjc404
95fb98f556 Update dgemm_kernel_4x8_haswell.S 2019-07-21 01:10:32 +08:00
wjc404
4801c6d36b Update dgemm_kernel_4x8_haswell.S 2019-07-21 00:47:45 +08:00
wjc404
9440fa607d Add files via upload 2019-07-20 22:08:22 +08:00
wjc404
94db259e5b Add files via upload 2019-07-20 22:04:41 +08:00
wjc404
f49f8047ac Add files via upload 2019-07-20 14:33:37 +08:00
wjc404
825777faab Update dgemm_kernel_4x8_haswell.S 2019-07-19 23:58:24 +08:00
wjc404
9c89757562 Add files via upload 2019-07-19 23:47:58 +08:00
Martin Kroeker
b0b7600bef Merge pull request #2186 from wjc404/develop
Update "dgemm_kernel_4x8_haswell.S" for improving performance on zen2 chips
2019-07-18 16:04:44 +02:00
wjc404
9b04baeaee Update dgemm_kernel_4x8_haswell.S 2019-07-17 23:50:03 +08:00
wjc404
8a074b3965 Update dgemm_kernel_4x8_haswell.S 2019-07-17 23:47:30 +08:00
wjc404
211ab03b14 Update dgemm_kernel_4x8_haswell.S 2019-07-17 22:39:15 +08:00
wjc404
1733f927e6 Update dgemm_kernel_4x8_haswell.S 2019-07-17 21:27:41 +08:00
wjc404
182b06d6ad Update dgemm_kernel_4x8_haswell.S 2019-07-17 17:02:35 +08:00
wjc404
7a9050d681 Update dgemm_kernel_4x8_haswell.S 2019-07-17 00:55:06 +08:00
wjc404
0ba29fd262 Update dgemm_kernel_4x8_haswell.S for zen2
replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128
2019-07-17 00:46:51 +08:00
Martin Kroeker
bafa021ed6 Merge pull request #2181 from isuruf/install_name
Change install_name on osx to match linux
2019-07-09 20:08:52 +02:00
Isuru Fernando
b89d9762a2 Change install_name on osx to match linux 2019-07-08 17:14:35 -05:00
Martin Kroeker
08dedf4c5e Merge pull request #2177 from martin-frbg/noaff
Fix surprising behaviour of NO_AFFINITY=0
2019-07-07 18:28:21 +02:00
Martin Kroeker
b89c781637 Fix surprising behaviour of NO_AFFINITY=0 2019-07-07 16:04:45 +02:00
Martin Kroeker
dd7ff77f4b Merge pull request #2175 from martin-frbg/cmake-mingw-fixes
Fix CMAKE compilation with MinGW32 and add it to Appveyor
2019-07-06 18:07:19 +02:00
Martin Kroeker
8fb76134bc Mingw32 needs leading underscore on object names
(also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile)
2019-07-06 15:07:15 +02:00
Martin Kroeker
04d671aae2 Make disabling DYNAMIC_ARCH on unsupported systems work
needs to be unset in the cache for the change to have any effect
2019-07-06 15:05:04 +02:00
Martin Kroeker
f69a0be712 Add getarch flags to disable AVX on x86
(and other small fixes to match Makefile behaviour)
2019-07-06 15:02:39 +02:00
Martin Kroeker
ae9e8b131e Add mingw builds to Appveyor config 2019-07-06 14:30:33 +02:00
Martin Kroeker
9086543f50 Utest needs CBLAS but not necessarily FORTRAN 2019-07-06 14:29:47 +02:00
Martin Kroeker
abea977ded Merge pull request #2162 from martin-frbg/pgi
Fixes for PGI compiler
2019-07-03 19:16:30 +02:00
Martin Kroeker
6b6c9b1441 Merge pull request #2172 from quickwritereader/develop
power9 cgemm/ctrmm. new sgemm 8x16
2019-07-01 21:06:02 +02:00
AbdelRauf
a97b301aaa cgemm/ctrmm power9 2019-07-01 14:07:54 +00:00
Martin Kroeker
2f13f04224 Merge pull request #2170 from pkubaj/patch-1
Fix build on PPC970 for FreeBSD
2019-06-30 23:29:02 +02:00
pkubaj
7c7505a778 Fix build for PPC970 on FreeBSD pt.2
FreeBSD needs those macros too.
2019-06-28 10:31:45 +00:00
pkubaj
5a4f1a2118 Fix build for PPC970 on FreeBSD pt. 1
FreeBSD needs DCBT_ARG=0 as well.
2019-06-28 10:29:44 +00:00
Martin Kroeker
3b761892df Merge pull request #2169 from pkubaj/develop
Fix build on FreeBSD/powerpc64.
2019-06-25 12:56:33 +02:00
Piotr Kubaj
eebfeba768 Fix build on FreeBSD/powerpc64.
Signed-off-by: Piotr Kubaj <pkubaj@anongoth.pl>
2019-06-25 10:58:56 +02:00
Martin Kroeker
7684c4f8f8 PGI compiler does not like -march=native 2019-06-20 19:56:01 +02:00
Martin Kroeker
7faf42b7bb Merge pull request #2167 from kavanabhat/dtrmm_power8_segfault
Fix DTRMMKERNEL register save for power8 64-bit mode (Fix for #2166)
2019-06-19 14:38:01 +02:00
kavanabhat
a575f1e4c7 Update dtrmm_kernel_16x4_power8.S 2019-06-19 15:27:14 +05:30
AbdelRauf
cdbfb891da new sgemm 8x16 2019-06-17 15:33:38 +00:00
Martin Kroeker
280552b988 Fix mov syntax 2019-06-16 18:35:43 +02:00
Martin Kroeker
bbd4bb0154 Zero ecx with a mov instruction
PGI assembler does not like the initialization in the constraints.
2019-06-16 15:04:10 +02:00
Martin Kroeker
6d3efb2b58 Update Makefile.x86_64 2019-06-14 08:08:11 +02:00
Martin Kroeker
d9ff2cd90d Do not force gcc options on non-gcc compilers
fixes compile failure with pgi 18.10 as reported on OpenBLAS-users
2019-06-13 23:01:35 +02:00
Martin Kroeker
2a43062de7 Merge pull request #2159 from martin-frbg/issue2149
Avoid unintentional activation of TLS codepath via USE_TLS=0
2019-06-10 19:12:45 +02:00
Martin Kroeker
4ea794a522 Avoid unintentional activation of TLS code via USE_TLS=0
fixes #2149
2019-06-10 17:24:15 +02:00
Martin Kroeker
ece0bfb881 Merge pull request #2158 from martin-frbg/issue2143
Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds
2019-06-10 14:08:11 +02:00
Martin Kroeker
1f4b6a5d5d Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds
from #2143, -march=native precludes use of more specific options like -march=skylake-avx512 in individual kernels, and defeats the purpose of dynamic arch anyway.
2019-06-10 09:50:13 +02:00
Martin Kroeker
be8f70d269 Merge pull request #2157 from martin-frbg/2154-2
Add gfortran workaround for potential ABI violation
2019-06-09 12:19:08 +02:00
Martin Kroeker
e674e1c735 Update fc.cmake 2019-06-09 09:31:13 +02:00
Martin Kroeker
6ca898b63b Add gfortran workaround for potential ABI violation
for #2154
2019-06-08 23:17:03 +02:00
Martin Kroeker
26411acd56 Merge pull request #2148 from TiborGY/cpp_thread_test_2
Thread safety tester using C++11 threading (cleaned history)
2019-06-07 13:23:07 +02:00
Martin Kroeker
0ab4076dd8 Merge pull request #2156 from martin-frbg/issue2154
Add gfortran workaround for C->FORTRAN ABI violation
2019-06-06 13:43:12 +02:00
Martin Kroeker
a0caa762b3 Add gfortran workaround for ABI violations
for #2154 (see gcc bug 90329)
2019-06-06 10:24:16 +02:00
Martin Kroeker
900d5a3205 Add gfortran workaround for ABI violations in LAPACKE
for #2154 (see gcc bug 90329)
2019-06-06 10:18:40 +02:00
Martin Kroeker
a17cf36225 Merge pull request #2153 from quickwritereader/develop
improved power9 zgemm,sgemm
2019-06-06 07:42:56 +02:00
AbdelRauf
148c4cc5fd conflict resolve 2019-06-05 20:50:50 +00:00
AbdelRauf
d0c3543c3f power9 zgemm ztrmm optimized 2019-06-05 20:07:16 +00:00
Martin Kroeker
909ad04aef Merge pull request #2145 from martin-frbg/1912-3
Separate implementations of AMAX and IAMAX on arm
2019-06-05 20:27:45 +02:00
Martin Kroeker
417efd41c6 Merge pull request #2110 from pc2/cpu-detection
Fix detection of Skylake processors when using GCC
2019-06-05 20:27:05 +02:00
Michael Lass
9cdc828afa c_check: Unlink correct file 2019-06-05 17:31:01 +02:00
Michael Lass
7a9a4dbc4f Fix detection of AVX512 capable compilers in getarch
21eda8b5 introduced a check in getarch.c to test if the compiler is capable of
AVX512. This check currently fails, since the used __AVX2__ macro is only
defined if getarch itself was compiled with AVX2/AVX512 support. Make sure this
is the case by building getarch with -march=native on x86_64. It is only
supposed to run on the build host anyway.
2019-06-05 17:30:56 +02:00
AbdelRauf
a469b32cf4 sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52 2019-06-04 07:11:30 +00:00
Martin Kroeker
27649b9543 Document NO_AVX512
for #2151
2019-06-03 11:01:33 +02:00
TiborGY
16f3df5d35 add c++ thread test option to Makefile.rule 2019-06-01 21:36:41 +02:00
TiborGY
1aded69821 hook up c++ thread safety test (main Makefile) 2019-06-01 21:32:52 +02:00
TiborGY
c00289ba54 upload thread safety test folder 2019-06-01 21:30:06 +02:00
AbdelRauf
8fe794f059 improved zgemm power9 based on power8 2019-05-30 15:31:25 +00:00
Martin Kroeker
74c10b57c6 Use generic kernels for complex (I)AMAX to support softfp 2019-05-30 11:38:11 +02:00
Martin Kroeker
c5495d2056 Ensure correct output for DAMAX with softfp 2019-05-30 11:25:43 +02:00
Martin Kroeker
c70496b108 Separate implementations of AMAX and IAMAX on arm
As noted in #1912 and comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but cannot return both value and index on softfp where they would have to share the return register
2019-05-29 15:02:51 +02:00
Martin Kroeker
ca8d8835f5 Merge pull request #2144 from xianyi/revert-2142-issue1912-2
Revert "Add softfp support in min/max kernels"
2019-05-29 14:09:10 +02:00
Martin Kroeker
d76b20b4d2 Revert "Add softfp support in min/max kernels" 2019-05-29 14:07:17 +02:00
Martin Kroeker
85af04da3c Merge pull request #2142 from martin-frbg/issue1912-2
Add softfp support in min/max kernels
2019-05-28 22:56:08 +02:00
Martin Kroeker
11e0dcbffb Merge pull request #2141 from martin-frbg/issue1912
Build and run utests independently of fortran
2019-05-28 20:50:40 +02:00
Martin Kroeker
79366ff7a9 Add softfp support in min/max kernels
fix for #1912
2019-05-28 20:34:22 +02:00
Martin Kroeker
21d05a4835 Merge pull request #2140 from martin-frbg/pgi19
Do not try ancient PGI hacks with recent versions of that compiler
2019-05-26 12:39:20 +02:00
Martin Kroeker
940f38f6dd Build and run utests in any case, they do their own checks for fortran availability 2019-05-24 13:02:23 +02:00
Martin Kroeker
1778fd4219 Do not try ancient PGI hacks with recent versions of that compiler
should fix #2139
2019-05-22 13:48:27 +02:00
Martin Kroeker
969dd6175e Merge pull request #2136 from martin-frbg/issue2126
Add option to allow combining USE_THREAD=0 with thread locking support
2019-05-16 12:08:16 +02:00
Martin Kroeker
d8d5682481 Merge pull request #2134 from tylerjereddy/skylake_regress_guard_may14
TST: add SkylakeX AVX512 CI test
2019-05-15 23:40:06 +02:00
Martin Kroeker
f66c11fc22 Remove unrelated change 2019-05-15 23:38:12 +02:00
Martin Kroeker
5ecffc28f2 Add option USE_LOCKING but keep default settings intact 2019-05-15 23:36:17 +02:00
Martin Kroeker
86dda5c2fa Add option USE_LOCKING for SMP-like locking in USE_THREAD=0 builds 2019-05-15 23:21:20 +02:00
Martin Kroeker
1e52572be3 Add option USE_LOCKING for single-threaded build with locking support 2019-05-15 23:19:30 +02:00
Martin Kroeker
d2cb610272 Add option USE_LOCKING for single-threaded build with locking support
for calling from concurrent threads
2019-05-15 23:18:43 +02:00
Tyler Reddy
a211bc9b6a TST: add SkylakeX AVX512 CI test
* adapt the C-level reproducer code for some
recent SkylakeX AVX512 kernel issues, provided
by Isuru Fernando and modified by Martin Kroeker,
for usage in the utest suite

* add an Intel SDE SkylakeX emulation utest run to
the Azure CI matrix; a custom Docker build was required
because Ubuntu image provided by Azure does not support
AVX512VL instructions
2019-05-14 11:32:23 -07:00
Martin Kroeker
9208ab8603 Merge pull request #2130 from isuruf/drone
Drone CI for arm64 native builds
2019-05-14 09:37:00 +02:00
Isuru Fernando
b43deb4ad6 Fix typo 2019-05-12 15:26:18 -05:00
Isuru Fernando
b911525c81 arm32 build 2019-05-12 15:21:43 -05:00
Isuru Fernando
7ff44e0016 Remove qemu armv8 builds 2019-05-12 15:09:53 -05:00
Isuru Fernando
e3cb8ad2d6 See if ubuntu 19.04 fixes the ICE 2019-05-12 14:28:48 -05:00
Isuru Fernando
7aa6faad5f parallel build 2019-05-12 14:22:36 -05:00
Isuru Fernando
3d94ab660f build without lapack on cmake 2019-05-12 14:17:12 -05:00
Isuru Fernando
cd99dfe034 Add cmake builds and print options 2019-05-12 14:10:10 -05:00
Isuru Fernando
dadafcdcd8 Add a cmake build as well 2019-05-12 14:10:10 -05:00
Isuru Fernando
d40c109eb0 no need of gcc in clang build 2019-05-12 14:10:10 -05:00
Isuru Fernando
608cd69b66 update yes 2019-05-12 14:10:10 -05:00
Isuru Fernando
231472c4c6 Fix typo 2019-05-12 14:10:10 -05:00
Isuru Fernando
612c2d78e0 apt update 2019-05-12 14:10:10 -05:00
Isuru Fernando
dc110e179d Switch to ubuntu and parallel jobs 2019-05-12 14:10:09 -05:00
Isuru Fernando
9184590c33 gfortran->gcc-gfortran 2019-05-12 14:10:09 -05:00
Isuru Fernando
a0aaf308ed Install gfortran and add a clang job 2019-05-12 14:10:09 -05:00
Isuru Fernando
15f925fe9a Install perl 2019-05-12 14:10:09 -05:00
Isuru Fernando
21acf03e9a Install gcc 2019-05-12 14:10:09 -05:00
Isuru Fernando
ff807473bb remove sudo 2019-05-12 14:10:09 -05:00
Isuru Fernando
58829c0988 install make 2019-05-12 14:10:09 -05:00
Isuru Fernando
d86f0b9e74 Test drone CI 2019-05-12 14:10:09 -05:00
Martin Kroeker
63554d5dec Merge pull request #2129 from martin-frbg/armv8azure
Move ARMv8/gcc CI job from Travis to Azure
2019-05-12 09:55:57 +02:00
Martin Kroeker
43068288e9 Update .travis.yml 2019-05-11 22:37:06 +02:00
Martin Kroeker
999a04f101 Move ARMv8 gcc build from Travis to Azure 2019-05-11 16:08:23 +02:00
Martin Kroeker
3cb1c8d210 Move ARMv8 gcc build from Travis to Azure 2019-05-11 16:07:30 +02:00
Martin Kroeker
ff1bfe7b16 Merge pull request #2127 from martin-frbg/issue2114_2
Add NO_AFFINITY to available CMAKE options on Linux, and set it to ON
2019-05-09 15:25:09 +02:00
Martin Kroeker
9ea30f3788 Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125)
* Mark iamax_sse.S as unsuitable for MIN due to issue #2116
* Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116
2019-05-09 14:42:36 +02:00
Martin Kroeker
a3d4c65d62 Add NO_AFFINITY to available options on Linux, and set it to ON
to match the gmake default. Fixes second part of #2114
2019-05-09 11:52:02 +02:00
Martin Kroeker
e1fc02095c Merge pull request #2124 from tylerjereddy/manylinux1_azure
TST: Azure manylinux1 & clean-up
2019-05-09 08:57:37 +02:00
Martin Kroeker
0cd6d8508f Merge pull request #2123 from tylerjereddy/azure_readme_badge
DOC: Add Azure CI status badge to README
2019-05-09 08:10:19 +02:00
Martin Kroeker
c2f152c470 Merge pull request #2120 from brada4/getrf-2113
Address redundant code concern #2113
2019-05-09 08:10:00 +02:00
Tyler Reddy
4efbac28ed TST: Azure manylinux1 & clean-up
* remove some of the steps & comments
from the original Azure yml template

* modify the trigger section to use
develop since OpenBLAS primarily uses
this branch; use the same batching
behavior as downstream projects NumPy/
SciPy

* remove Travis emulated ARMv6 gcc build
because this now happens in Azure

* use documented Ubuntu vmImage name for Azure
and add in a manylinux1 test run to the matrix

[skip appveyor]
2019-05-08 21:58:49 -07:00
Martin Kroeker
406c7242f4 Add ARMV6 build to azure CI setup (#2122)
using aytekinar's Alpine image and docker script from the Travis setup

[skip ci]
2019-05-09 00:47:44 +02:00
Tyler Reddy
53703585aa DOC: Add Azure CI status badge 2019-05-08 15:15:50 -07:00
Martin Kroeker
ad20ceaa68 Update azure-pipelines.yml 2019-05-08 19:07:58 +02:00
Martin Kroeker
dd77a3f0e2 Update azure-pipelines.yml 2019-05-08 15:25:43 +02:00
Martin Kroeker
a598ab1d32 Update azure-pipelines.yml 2019-05-08 15:23:54 +02:00
Martin Kroeker
16fd8e3dbe Update azure-pipelines.yml 2019-05-08 14:14:22 +02:00
Martin Kroeker
aa4c41bad2 Update azure-pipelines.yml
take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie)
2019-05-08 14:12:02 +02:00
Martin Kroeker
5cf434167a fix tabbing in azure commands 2019-05-08 13:58:59 +02:00
Martin Kroeker
3a49e8c05a first try migrating one of the arm builds from travis 2019-05-08 13:52:22 +02:00
Martin Kroeker
95e2cf32e1 Merge pull request #2121 from tylerjereddy/ppc64le-travis
TST: add native POWER8 to CI
2019-05-08 13:31:46 +02:00
Martin Kroeker
70cea0b96b Update link to IBM MASS library, update cpu support status 2019-05-08 12:20:00 +02:00
Martin Kroeker
ae0dec77ec Merge pull request #2118 from Diazonium/develop
Change two http links to https
2019-05-08 11:41:17 +02:00
Tyler Reddy
e47b63466b TST: add native POWER8 to CI
* add native POWER8 testing to
Travis CI matrix with ppc64le
os entry
2019-05-07 19:11:08 -07:00
Zhang Xianyi
7d1b468d9d Set up CI with Azure Pipelines
[skip ci]
2019-05-08 09:58:01 +08:00
Andrew
575a84398a remove redundant code #2113 2019-05-07 23:46:54 +03:00
Martin Kroeker
5cabda79d0 Merge pull request #2117 from martin-frbg/issue2114
Fix errors in cpu affinity setup with glibc 2.6
2019-05-07 18:18:16 +02:00
Diazonium
c516209581 Change two http links to https
Closes #2109
2019-05-07 14:55:20 +02:00
Martin Kroeker
a6a8cc2b7f Fix errors in cpu enumeration with glibc 2.6
for #2114
2019-05-07 13:34:52 +02:00
Andrew
3d7debbb28 init 2019-05-07 13:15:08 +03:00
Fabrice Fontaine
5a9cce2bf6 Makefile.arm: remove -march flags
The provided -march flags, especially for ARMv5 and ARMv6 may not
necessarily match the needed ones: for ARMv5, it might be armv5,
armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain
sysroot can be used in a multilib toolchain.

Therefore, let the user building OpenBLAS pass the appropriate -march
flag.

The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they
are actually required for the build to proceed (OpenBLAS uses VFP
instructions, and assume an EABIhf ABI).

[Peter: update for v0.2.20]
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Peter Korsgaard <peter@korsgaard.com>
[Retrieved from:
https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch]
Signed-off-by: Fabrice Fontaine <fontaine.fabrice@gmail.com>
2019-05-05 18:37:28 +02:00
Martin Kroeker
6a8b4269b5 Merge pull request #2111 from martin-frbg/issue1955
Disable the SkyLakeX DGEMMIxCOPY kernels as well
2019-05-05 18:08:49 +02:00
Martin Kroeker
b1561ecc68 Disable DGEMMINCOPY as well for now
#1955
2019-05-05 15:52:01 +02:00
Martin Kroeker
7ed8431527 Disable the SkyLakeX DGEMMITCOPY kernel as well
as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955
2019-05-04 22:54:41 +02:00
Martin Kroeker
a387a23518 Merge pull request #2101 from luzpaz/misc-typos
Misc. typo fixes in comments and documentation
2019-05-04 22:28:29 +02:00
luz.paz
b46875b76b Revert Changelog.txt typos 2019-05-04 15:43:17 -04:00
luz.paz
858e609e1f Revert reference/ fixes 2019-05-04 15:01:29 -04:00
Martin Kroeker
3f427c0cf9 Merge pull request #2107 from quickwritereader/develop
sgemm/strmm kernel for power9
2019-05-02 07:56:57 +02:00
Martin Kroeker
c95317158f Merge pull request #2105 from martin-frbg/issue2104
Correct argument of CPU_ISSET for glibc <2.5
2019-05-02 07:56:37 +02:00
AbdelRauf
47f892198c conflict resolve 2019-05-01 19:36:22 +00:00
Martin Kroeker
b43c8382c8 Correct argument of CPU_ISSET for glibc <2.5
fixes #2104
2019-05-01 10:46:46 +02:00
luz.paz
daf2fec12d Misc. typo fixes
Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib`
2019-04-29 17:03:56 -04:00
Martin Kroeker
4f8143b098 Increment version to 0.3.7.dev 2019-04-29 19:25:32 +02:00
Martin Kroeker
bfeb9c16b0 Increment version to 0.3.7.dev 2019-04-29 19:24:53 +02:00
Martin Kroeker
97d5034ed3 Merge branch 'release-0.3.0' into develop 2019-04-29 19:21:54 +02:00
Martin Kroeker
9763f872fc Update Changelog with changes from 0.3.6 2019-04-29 19:18:26 +02:00
AbdelRauf
628b335e83 Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop 2019-04-29 08:57:44 +00:00
AbdelRauf
0f105dd8a5 sgemm/strmm 2019-04-29 08:49:50 +00:00
Martin Kroeker
9c4edd38f2 Merge pull request #2099 from martin-frbg/rela-gbtrf
Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF
2019-04-29 09:25:19 +02:00
Martin Kroeker
1036299da0 Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF
due to crashes in LAPACK tests
2019-04-29 00:12:37 +02:00
Martin Kroeker
5b0398186e Merge pull request #2098 from martin-frbg/rela-malloc
Disable reallocation of work array in ReLAPACK xSYTRF
2019-04-28 19:31:01 +02:00
Martin Kroeker
452859f4e1 Merge pull request #2097 from martin-frbg/rela-getrf
Correct INFO=4 condition in ReLAPACK xGETRF
2019-04-28 19:28:57 +02:00
Martin Kroeker
2cd463eabd Disable reallocation of work array in xSYTRF
as it appears to cause memory management problems (seen in the LAPACK tests)
2019-04-28 10:02:28 +02:00
Martin Kroeker
11530b76f7 Correct INFO=4 condition 2019-04-28 09:58:56 +02:00
Martin Kroeker
91943b7325 Merge pull request #2096 from martin-frbg/eig-testing
Avoid out-of-bounds accesses in LAPACK EIG tests
2019-04-28 09:55:42 +02:00
Martin Kroeker
268c28db7d Merge pull request #2095 from martin-frbg/trsm
Correct length of name string in xerbla call
2019-04-28 09:55:25 +02:00
Martin Kroeker
2aad88d5b9 Avoid out-of-bounds accesses in LAPACK EIG tests
see https://github.com/Reference-LAPACK/lapack/issues/333
2019-04-27 23:01:49 +02:00
Martin Kroeker
0bd956fd21 Correct length of name string in xerbla call 2019-04-27 22:49:04 +02:00
Martin Kroeker
bbd9d98664 Merge pull request #2094 from martin-frbg/issue2066
Fix ReLAPACK integration problems
2019-04-27 22:45:47 +02:00
Martin Kroeker
798c448b0c Add support for INTERFACE64 and fix XERBLA calls
1. Replaced all instances of "int" with "blasint"
2. Added string length as "hidden" third parameter in calls to fortran XERBLA
2019-04-27 19:06:00 +02:00
Martin Kroeker
9a19616a28 Support INTERFACE64=1 2019-04-27 18:55:47 +02:00
Martin Kroeker
6b41eb9c0c Merge pull request #2092 from jeffbaylor/snprintf_with_MSC_VER
snprintf define consolidated to common.h
2019-04-23 20:12:06 +02:00
Martin Kroeker
ccfb7ead15 Merge pull request #2072 from martin-frbg/sum
Add (C)BLAS extension ?sum
2019-04-23 20:11:36 +02:00
Jeff Baylor
40e53e52d6 snprintf define consolidated to common.h 2019-04-22 17:01:34 -07:00
Martin Kroeker
744779d335 Merge pull request #2084 from RashmicaG/develop
Add in runtime CPU detection for POWER.
2019-04-14 21:40:07 +02:00
Rashmica Gupta
bcdf1d4917 Add in runtime CPU detection for POWER. 2019-04-09 14:20:16 +10:00
Martin Kroeker
e06b8438b4 Merge pull request #2080 from martin-frbg/issue2075
Add -lm and disable EXPRECISION support on *BSD
2019-04-02 21:40:58 +02:00
Martin Kroeker
9229d6859b Add -lm and disable EXPRECISION support on *BSD
fixes #2075
2019-04-02 09:38:18 +02:00
Martin Kroeker
21d146a8de Add declarations for ?sum 2019-03-31 22:12:23 +02:00
Martin Kroeker
7f4e36d219 Merge pull request #2073 from martin-frbg/issue2056-2
Detect 32bit environment on 64bit ARM hardware
2019-03-31 13:56:08 +02:00
Martin Kroeker
c04a729081 Add ?sum definitions for generic kernel 2019-03-31 13:55:49 +02:00
Martin Kroeker
100d94f94e Add ?sum 2019-03-31 13:55:05 +02:00
Martin Kroeker
d17da6c6a4 Add cmake defaults for ?sum kernels 2019-03-31 11:57:01 +02:00
Martin Kroeker
1679de5e59 Detect 32bit environment on 64bit ARM hardware
for #2056, using same approach as #2058
2019-03-31 10:50:43 +02:00
Martin Kroeker
246ca29679 Add ZARCH implementation of ?sum
as trivial copies of the respective ?asum kernels with the ABS and vflpsb calls removed
2019-03-30 22:49:05 +01:00
Martin Kroeker
9d717cb5ee Add x86_64 implementation of ?sum
as trivial copy of ?asum with the fabs calls removed
2019-03-30 22:27:04 +01:00
Martin Kroeker
e3bc83f2a8 Add x86 implementation of ?sum
as trivial copy of ?asum with the fabs calls removed
2019-03-30 22:26:10 +01:00
Martin Kroeker
70f2a4e0d7 Add SPARC implementation of ?sum
as trivial copy of ?asum with the fabs replaced by fmov to preserve code structure
2019-03-30 22:25:06 +01:00
Martin Kroeker
706dfe263b Add POWER implementation of ?sum
as trivial copy of ?asum with the fabs replaced by fmr to preserve code structure
2019-03-30 22:23:42 +01:00
Martin Kroeker
688fa9201c Add MIPS64 implementation of ?sum
as trivial copy of ?asum with the fabs replaced by mov to preserve code structure
2019-03-30 22:22:15 +01:00
Martin Kroeker
cdbe0f0235 Add MIPS implementation of ?sum
as trivial copy of ?asum with the fabs calls removed
2019-03-30 22:20:14 +01:00
Martin Kroeker
f8b82bc6dc Add ia64 implementation of ?sum
as trivial copy of asum with the fabs calls removed
2019-03-30 22:18:03 +01:00
Martin Kroeker
3e3ccb9011 Add ARM64 implementations of ?sum
as trivial copies of the respective ?asum kernels with the fabs calls removed
2019-03-30 22:13:36 +01:00
Martin Kroeker
94ab4e6fb2 Add ARM implementations of ?sum
(trivial copies of the respective ?asum with the fabs calls removed)
2019-03-30 22:11:38 +01:00
Martin Kroeker
c3cfc6986b Add implementations of ssum/dsum and csum/zsum
as trivial copies of asum/zsasum with the fabs calls replaced by fmov to preserve code structure
2019-03-30 22:05:11 +01:00
Martin Kroeker
b9f4943a14 Add ?sum 2019-03-30 22:01:13 +01:00
Martin Kroeker
79cfc24a62 Add interface for ?sum (derived from ?asum) 2019-03-30 21:59:18 +01:00
Martin Kroeker
5c42287c4f Add declarations for ?sum and cblas_?sum 2019-03-30 21:58:03 +01:00
Martin Kroeker
32c7063cb0 Merge pull request #2061 from martin-frbg/martin-frbg-patch-1
Disable the AVX512 DGEMM kernel (again)
2019-03-30 21:21:38 +01:00
Martin Kroeker
c19a449096 Merge pull request #2071 from martin-frbg/issue2068
Provide CBLAS interfaces to I?MIN and I?MAX
2019-03-30 14:54:28 +01:00
Martin Kroeker
3d1e36d4cb Build CBLAS interfaces for I?MIN and I?MAX 2019-03-30 12:38:41 +01:00
Martin Kroeker
4f9d3e4b28 Expose CBLAS interfaces for I?MIN and I?MAX 2019-03-30 12:37:13 +01:00
Martin Kroeker
4dec151d0b Merge pull request #2070 from quickwritereader/develop
power9 makefile. dgemm based on power8 kernel with following changes …
2019-03-29 21:46:21 +01:00
Martin Kroeker
7c51cc8527 Merge branch 'develop' into develop 2019-03-29 19:36:29 +01:00
AbdelRauf
853a18bc17 power9 makefile. dgemm based on power8 kernel with following changes : 32x unrolled 16x4 kernel and 8x4 kernel using (lxv stxv butterfly rank1 update). improvement from 17 to 22-23gflops. dtrmm cases were added into dgemm itself 2019-03-29 15:49:40 +00:00
Martin Kroeker
3ae122e2c7 Merge pull request #2069 from aixoss/aix-asm-change
AIX asm syntax changes needed for shared object creation
2019-03-25 21:34:30 +01:00
Ayappan P
b043a5962e AIX asm syntax changes needed for shared object creation 2019-03-25 18:53:25 +05:30
Martin Kroeker
8502030e5e Merge pull request #2064 from embray/cygwin/use-tls-thread-memory-cleanup
Fix for #2063
2019-03-19 22:12:51 +01:00
Erik M. Bray
8ba9e2a61a Also call CloseHandle on each thread, as well as on the event so as to not leak thread handles. 2019-03-19 11:21:44 +01:00
Erik M. Bray
4ad694eda1 Fix for #2063: The DllMain used in Cygwin did not run the thread memory
pool cleanup upon THREAD_DETACH which is needed when compiled with
USE_TLS=1.
2019-03-19 09:26:50 +01:00
Martin Kroeker
dff4a197a5 Merge pull request #2058 from xsacha/patch-3
Change 64-bit detection as explained in #2056
2019-03-16 11:57:23 +01:00
Martin Kroeker
a5425575b1 Merge pull request #2060 from embray/cygwin/readenv
Use POSIX getenv on Cygwin
2019-03-16 11:56:51 +01:00
Erik M. Bray
1006ff8a7b Use POSIX getenv on Cygwin
The Windows-native GetEnvironmentVariable cannot be relied on, as
Cygwin does not always copy environment variables set through Cygwin
to the Windows environment block, particularly after fork().
2019-03-15 15:06:30 +01:00
Martin Kroeker
e608d4f7fe Disable the AVX512 DGEMM kernel (again)
Due to as yet unresolved errors seen in #1955 and #2029
2019-03-13 22:10:28 +01:00
Martin Kroeker
4fc17d0d75 Trivial typo fix
as suggested in #2022
2019-03-13 19:20:23 +01:00
Sacha
c3e30b2bc2 Change 64-bit detection as explained in #2056 2019-03-13 23:21:54 +10:00
Martin Kroeker
03d7110900 Merge pull request #2042 from maomao194313/develop
add TARGET support for HiSilicon tsv110 CPUs
2019-03-12 22:57:39 +01:00
Martin Kroeker
3ce28fb81a Merge pull request #2055 from martin-frbg/atomid
Add CPUID data for Intel Denverton (as Nehalem)
2019-03-12 22:57:07 +01:00
Martin Kroeker
04f2226ea6 Add Intel Denverton 2019-03-12 16:09:55 +01:00
Martin Kroeker
b1393c7a97 Add Intel Denverton
for #2048
2019-03-12 16:03:56 +01:00
maomao194313
7e3eb9b25d make DYNAMIC_ARCH=1 package work on TSV110 2019-03-12 16:11:01 +08:00
maomao194313
f074d7d146 make DYNAMIC_ARCH=1 package work on TSV110. 2019-03-12 16:05:19 +08:00
Martin Kroeker
f18ab6c17b Merge pull request #2051 from martin-frbg/issue2048
Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1
2019-03-09 16:39:35 +01:00
Martin Kroeker
946ec6c3b8 Merge pull request #2050 from kencu/PowerMacFix
PowerMac 970 fixes
2019-03-09 16:39:08 +01:00
Martin Kroeker
5b95534afc Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1
for issue #2048
2019-03-09 11:21:16 +01:00
ken-cunningham-webuse
f7a06463d9 common_power.h: force DCBT_ARG 0 on PPC970 Darwin
without this, we see
../kernel/power/gemv_n.S:427:Parameter syntax error
and many more similar entries

that relates to this assembly command
dcbt 8, r24, r18

this change makes the DCBT_ARG = 0
and openblas builds through to completion on PowerMac 970
Tests pass
2019-03-07 12:03:45 -08:00
ken-cunningham-webuse
b0c714ef60 param.h : enable defines for PPC970 on DarwinOS
fixes:
gemm.c: In function 'sgemm_':
../common_param.h:981:18: error: 'SGEMM_DEFAULT_P' undeclared (first use in this function)
 #define SGEMM_P  SGEMM_DEFAULT_P
                  ^
2019-03-07 12:03:25 -08:00
Martin Kroeker
8d3d29e4d7 Merge pull request #2049 from Celelibi/fix_crash_sgemm_sse_x64
Fix crash in sgemm SSE/nano kernel on x86_64
2019-03-07 19:28:06 +01:00
Celelibi
b7f59da42d Fix crash in sgemm SSE/nano kernel on x86_64
Fix bug #2047.

Signed-off-by: Celelibi <celelibi@gmail.com>
2019-03-07 16:55:13 +01:00
Martin Kroeker
db3dc9e282 Merge pull request #2046 from kencu/powermac
ctest.c : add __POWERPC__ for PowerMac
2019-03-07 14:51:41 +01:00
ken-cunningham-webuse
4290afdae2 ctest.c : add __POWERPC__ for PowerMac 2019-03-06 20:55:06 -08:00
Martin Kroeker
4741ce803b Merge pull request #2045 from martin-frbg/2033-3
Do not compile in AVX512 check if AVX support is disabled
2019-03-06 22:40:26 +01:00
Martin Kroeker
11cfd0bd75 Do not compile in AVX512 check if AVX support is disabled
xgetbv is function depends on NO_AVX being undefined - we could change that too, but that combo is unlikely to work anyway
2019-03-05 16:04:25 +01:00
Martin Kroeker
651ab01d2b Merge pull request #2044 from martin-frbg/issue2043
Fix module definition conflicts between LAPACK and ReLAPACK
2019-03-05 12:11:32 +01:00
Martin Kroeker
d7b2c53c0b Merge pull request #2039 from brada4/meminit
Address warning in memory.c
2019-03-05 12:11:15 +01:00
Martin Kroeker
e4864a8933 Fix module definition conflicts between LAPACK and ReLAPACK
for #2043
2019-03-04 21:17:08 +01:00
Martin Kroeker
10d841d8b9 Merge pull request #2026 from martin-frbg/trmv_threads
Correct range limiting in trmv_thread and re-enable TRMV multithreading
2019-03-04 15:08:31 +01:00
Martin Kroeker
12f2b76748 Merge pull request #2038 from martin-frbg/issue2035
Improve handling of NO_STATIC and NO_SHARED
2019-03-04 15:07:48 +01:00
Martin Kroeker
6c83b878f6 Merge pull request #2040 from martin-frbg/locks2002
Restore locking optimizations for OpenMP case
2019-03-04 15:07:14 +01:00
maomao194313
fb4dae7124 add TARGET support for HiSilicon tsv110 CPUs 2019-03-04 16:48:49 +08:00
maomao194313
760842dda1 add TARGET support for HiSilicon tsv110 CPUs 2019-03-04 16:45:22 +08:00
maomao194313
53f482ee72 add TARGET support for HiSilicon tsv110 CPUs 2019-03-04 16:41:21 +08:00
maomao194313
783ba8058f HiSilicon tsv110 CPUs optimization branch
add HiSilicon tsv110 CPUs  optimization branch
2019-03-04 16:30:50 +08:00
Martin Kroeker
af480b02a4 Restore locking optimizations for OpenMP case
restore another accidentally dropped part of #1468 that was missed in #2004 to address performance regression reported in #1461
2019-03-03 14:17:07 +01:00
Andrew
e4a79be6bb address warning introed with #1814 et al 2019-03-03 09:05:11 +02:00
Andrew
e5c316c6b9 init 2019-03-03 08:59:27 +02:00
Martin Kroeker
25427926bc Improve handling of NO_STATIC and NO_SHARED
to avoid surprises from defining either as zero. Fixes #2035 by addressing some concerns from #1422
2019-03-02 23:36:36 +01:00
Martin Kroeker
edb8143141 Merge pull request #2037 from martin-frbg/issue2033-2
Make sure that AVX512 is disabled in 32bit builds
2019-03-01 11:45:02 +01:00
Martin Kroeker
c4868d11c0 Make sure that AVX512 is disabled in 32bit builds
for #2033
2019-03-01 09:23:03 +01:00
Martin Kroeker
4c321ae571 Merge pull request #2034 from martin-frbg/issue2033
Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX
2019-02-28 22:10:12 +01:00
Martin Kroeker
2ffb727187 Keep xcode8.3 for osx BINARY=32 build
as xcode10 deprecated i386
2019-02-28 10:51:54 +01:00
Martin Kroeker
d66214c946 Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX
fixes #2033
2019-02-28 09:58:25 +01:00
Martin Kroeker
fd34820b99 Fix AVX512 test always returning false due to missing compiler option 2019-02-25 17:58:31 +01:00
Martin Kroeker
918a0cc4d1 Fix missing -c option in AVX512 test 2019-02-25 17:55:36 +01:00
Martin Kroeker
0db9c03e7e Merge pull request #2028 from brada4/mv
Move one of clobber fixes to right place
2019-02-24 19:50:23 +01:00
Andrew
6eee1beac5 move fix to right place 2019-02-24 20:41:02 +02:00
Andrew
e5df5958cc init 2019-02-24 20:39:25 +02:00
Martin Kroeker
343b301d14 Reduce list of kernels in the dynamic arch build
to make compilation complete reliably within the 1h limit again
2019-02-20 10:27:48 +01:00
Martin Kroeker
45333d5793 Fix error introduced during cleanup 2019-02-19 22:16:33 +01:00
Martin Kroeker
e29b0cfcc4 Allow multithreading TRMV again
revert workaround introduced for issue #1332 as the actual cause appears to be my incorrect fix from #1262 (see #1388)
2019-02-19 21:03:30 +01:00
Martin Kroeker
78d9910236 Correct range_n limiting
same bug as seen in #1388, somehow missed in corresponding PR #1389
2019-02-19 20:59:48 +01:00
Martin Kroeker
e12cdf58ef Merge pull request #2024 from martin-frbg/gcc9fixes4
Fix inline assembly constraints in Bulldozer TRSM kernels
2019-02-17 11:49:15 +01:00
Martin Kroeker
1860c9456d Merge pull request #2023 from martin-frbg/gcc9fixes3
Fix inline assembly constraints in various x86_64 GEMVN kernels
2019-02-17 11:48:57 +01:00
Martin Kroeker
aec905498f Merge pull request #1988 from TiborGY/patch-1
Reword/expand comments in Makefile.rule
2019-02-17 11:36:04 +01:00
TiborGY
56089991e2 fix the the 2019-02-16 23:26:13 +01:00
Martin Kroeker
f9bb76d29a Fix inline assembly constraints in Bulldozer TRSM kernels
rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009
2019-02-16 20:06:48 +01:00
Martin Kroeker
8242b1fe3f Fix inline assembly constraints 2019-02-16 18:51:09 +01:00
Martin Kroeker
efb9038f72 Fix inline assembly constraints 2019-02-16 18:46:17 +01:00
Martin Kroeker
e976557d29 Fix inline assembly constraints
rework indices to allow marking argument lda as input and output.
2019-02-16 18:36:39 +01:00
Martin Kroeker
9d8be15789 Fix inline assembly constraints
rework indices to allow marking argument lda4 as input and output. For #2009
2019-02-16 18:24:11 +01:00
Martin Kroeker
d752799a0f Merge pull request #2021 from martin-frbg/gcc9fixes2
Fix wrong constraints in inline assembly of Haswell DTRSM kernel
2019-02-16 18:05:40 +01:00
TiborGY
f209fc7fa9 Update Makefile.rule
add note about NUM_THREADS for package maintainers, add examples of programs that cause affinity troubles
2019-02-16 12:12:39 +01:00
Martin Kroeker
c26c0b77a7 Fix wrong constraints in inline assembly
for #2009
2019-02-15 15:08:16 +01:00
Martin Kroeker
1c6da2d03c Merge pull request #2019 from martin-frbg/gcc9fixes
Fix unannounced modification of input operand 8 (lda4) in Haswell GEMVN microkernel
2019-02-15 15:02:54 +01:00
Martin Kroeker
4255a58cd2 Rename operands to put lda on the input/output constraint list 2019-02-15 10:10:04 +01:00
Martin Kroeker
d3e4725548 Merge pull request #2020 from martin-frbg/issue1956
With the Intel compiler on Linux, prefer ifort for the final link step
2019-02-15 09:57:59 +01:00
Martin Kroeker
adb419ed67 With the Intel compiler on Linux, prefer ifort for the final link step
icc has known problems with mixed-language builds that ifort can handle just fine. Fixes #1956
2019-02-14 22:57:30 +01:00
Martin Kroeker
46e415b140 Save and restore input argument 8 (lda4)
Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009)
2019-02-14 22:43:18 +01:00
Martin Kroeker
cd5a59b9cf Merge pull request #2018 from bartoldeman/fix-dgemv-znver1-tree-vectorize
dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3
2019-02-14 21:55:11 +01:00
Bart Oldeman
69a97ca7b9 dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3
This fixes a crash in dblat2 when OpenBLAS is compiled using
-march=znver1 -ftree-vectorize -O2

See also:
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180
2019-02-14 16:27:58 +00:00
Martin Kroeker
b55c586fac Fix missing clobber in x86/x86_64 blas_quickdivide inline assembly function (#2017)
* Fix missing clobber in blas_quickdivide assembly
2019-02-14 15:21:36 +01:00
Martin Kroeker
056917d616 Merge pull request #2013 from martin-frbg/issue2011
Fix invalid memory access in PPC gemm_beta
2019-02-14 09:29:34 +01:00
Martin Kroeker
718efcec6f Fix out-of-bounds memory access in gemm_beta
Fixes #2011 (as suggested by davemq), assuming typo by K.Goto
2019-02-13 22:08:37 +01:00
Martin Kroeker
f9d67bb5e8 Fix out-of-bounds memory access in gemm_beta
Fixes #2011 (as suggested by davemq) presuming typo by K.Goto
2019-02-13 22:06:41 +01:00
Martin Kroeker
76bb74fcd4 Merge pull request #2012 from maamountki/z14
[ZARCH] Many improvements
2019-02-13 20:15:56 +01:00
maamountki
0a54c98b9d [ZARCH] Modify constraints 2019-02-13 21:06:25 +02:00
maamountki
bec54ae366 [ZARCH] Fix caxpy 2019-02-13 12:54:35 +02:00
Martin Kroeker
63d7bad8a5 Merge pull request #2010 from martin-frbg/issue2009
Fix declaration of input arguments in x86_64 GEMV, SYMV and DSCAL
2019-02-12 23:24:02 +01:00
Martin Kroeker
ab1630f9fa Fix declaration of arguments in inline assembly
Argument 0 is modified so should be input and output
2019-02-12 16:14:02 +01:00
Martin Kroeker
b824fa70eb Fix declaration of assembly arguments in SSYMV and DSYMV microkernels
Arguments 0 and 1 are both input and output
2019-02-12 16:00:18 +01:00
Martin Kroeker
91481a3e4e Fix declaration of input arguments in inline assembly
Argument 0 is modified as it doubles as a counter
2019-02-12 15:51:43 +01:00
Martin Kroeker
dc6ac9eab0 Fix declaration of input arguments in the x86_64 s/dGEMV_T and s/dGEMV_N kernels
Arguments 0 and 1 need to be tagged as both input and output
2019-02-12 15:33:48 +01:00
maamountki
f583674109 [ZARCH] Fix cgemv_t_4 2019-02-12 13:12:28 +02:00
maamountki
77fe70019f [ZARCH] Fix constraints and source code formatting 2019-02-11 16:01:13 +02:00
Martin Kroeker
03a2bf2602 Fix potential memory leak in cpu enumeration on Linux (#2008)
* Fix potential memory leak in cpu enumeration with glibc

An early return after a failed call to sched_getaffinity would leak the previously allocated cpu_set_t. Wrong calculation of the size argument in that call increased the likelyhood of that failure. Fixes #2003
2019-02-10 23:24:45 +01:00
Martin Kroeker
69edc5bbe7 Restore dropped patches in the non-TLS branch of memory.c (#2004)
* Restore dropped patches in the non-TLS branch of memory.c

As discovered in #2002, the reintroduction of the "original" non-TLS version of memory.c as an alternate branch had inadvertently used ba1f91f rather than a8002e2 , thereby dropping the commits for #1450, #1468, #1501, #1504 and #1520.
2019-02-07 20:06:13 +01:00
maamountki
7039770165 [ZARCH] Undo the last commit 2019-02-06 20:11:44 +02:00
Martin Kroeker
641767f846 Merge pull request #2001 from martin-frbg/cmake-dynlist
Support DYNAMIC_LIST option in cmake
2019-02-06 08:39:24 +01:00
Martin Kroeker
af6e2253a2 Merge pull request #2000 from martin-frbg/issue1989
Make c_check robust against old or incomplete perl installations
2019-02-06 00:29:30 +01:00
Martin Kroeker
5952e586ce Support DYNAMIC_LIST option in cmake
e.g. cmake -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST="NEHALEM;HASWELL;ZEN" ..
original issue was #1639
2019-02-05 23:51:40 +01:00
Martin Kroeker
f10408aae8 Merge pull request #1999 from martin-frbg/issue1996-2
fix second instance of complex.h for c++ as well
2019-02-05 22:02:11 +01:00
Martin Kroeker
d70ae3ab43 Make c_check robust against old or incomplete perl installations
by catching and working around failures to load modules, and avoiding object-oriented syntax in tempfile creation.
Fixes #1989
2019-02-05 20:06:34 +01:00
Martin Kroeker
1391fc46d2 fix second instance of complex.h for c++ as well 2019-02-05 19:29:33 +01:00
maamountki
11a43e8116 [ZARCH] Set alignment hint for vl/vst 2019-02-05 19:17:08 +02:00
Martin Kroeker
817fe9865c Merge pull request #1998 from martin-frbg/issue1992
Include complex rather than complex.h in C++ contexts
2019-02-05 17:39:59 +01:00
Martin Kroeker
f4b82d7bc4 Include complex rather than complex.h in C++ contexts
to avoid name clashes e.g. with boost headers that use I as a generic placeholder.
Fixes #1992 as suggested by aprokop in that issue ticket.
2019-02-05 13:30:13 +01:00
maamountki
61526480f9 [ZARCH] Fix copy constraint 2019-02-05 07:51:19 +02:00
maamountki
81daf6bc38 [ZARCH] Format source code, Fix constraints 2019-02-05 07:30:38 +02:00
maamountki
a38aa56e76 Merge pull request #1 from xianyi/develop
Update
2019-02-05 07:25:38 +02:00
Martin Kroeker
729e925174 Merge pull request #1996 from quickwritereader/develop
NBMAX=4096 for gemvn, added sgemvn 8x8 for future
2019-02-04 16:52:04 +01:00
Ubuntu
498ac98581 Note for unused kernels 2019-02-04 15:41:56 +00:00
Ubuntu
cd9ea45463 NBMAX=4096 for gemvn, added sgemvn 8x8 for future 2019-02-04 06:57:11 +00:00
Martin Kroeker
f9c5023e04 Merge pull request #1994 from quickwritereader/develop
sgemv cgemv pairs
2019-02-01 21:04:47 +01:00
Ubuntu
4abc375a91 sgemv cgemv pairs 2019-02-01 13:45:00 +00:00
Martin Kroeker
874df65491 Fix incorrect sgemv results for IBM z14
part of PR #1993 that was inadvertently misplaced into the toplevel directory
2019-02-01 12:58:59 +01:00
Martin Kroeker
1f4b61f572 Delete misplaced file sgemv_t_4.c
from #1993 , file should have gone into kernel/zarch
2019-02-01 12:57:01 +01:00
Martin Kroeker
282230c303 Merge pull request #1993 from martin-frbg/aarnes-zarch
Various fixes for the new Z14 target
2019-01-31 21:27:00 +01:00
Martin Kroeker
cce574c3e0 Improve the z14 SGEMVT kernel
from patch provided by aarnez in #991
2019-01-31 21:24:55 +01:00
Martin Kroeker
877023e1e1 Fix precision of zarch DSDOT
from patch provided by aarnez in #991
2019-01-31 21:22:26 +01:00
Martin Kroeker
265142edd5 Fix typo in the zarch min/max kernels
from patch provided by aarnez in #991
2019-01-31 21:21:40 +01:00
Martin Kroeker
885a3c4350 USE_TRMM on Z14
from patch provided by aarnez in #991
2019-01-31 21:18:09 +01:00
Martin Kroeker
4b512f84dd Add cache sizes for Z14
from patch provided by aarnez in #991
2019-01-31 21:16:44 +01:00
Martin Kroeker
72d3e7c9b4 Add FORCE Z14
from patch provided by aarnez in #991
2019-01-31 21:15:50 +01:00
Martin Kroeker
bdc73a49e0 Add parameters for Z14
from patch provided by aarnez in #991
2019-01-31 21:14:37 +01:00
Martin Kroeker
1249ee1fd0 Add Z14 target
from patch provided by aarnez in #991
2019-01-31 21:13:46 +01:00
Martin Kroeker
42df9efa0c Merge pull request #1991 from maamountki/z14
[ZARCH] Z14 Support, BLAS 1/2 single precision implementations
2019-01-31 19:10:03 +01:00
maamountki
82124729af Merge branch 'develop' into z14 2019-01-31 19:36:41 +02:00
maamountki
29416cb5a3 [ZARCH] Add Z13 version for max/min functions 2019-01-31 19:11:11 +02:00
maamountki
48b9b94f7f [ZARCH] Improve loading performance for camax/icamax 2019-01-31 18:52:11 +02:00
Martin Kroeker
86a824c97f Fix wrong comparison that made IMIN identical to IMAX
as reported by aarnez in #1990
2019-01-31 15:27:21 +01:00
Martin Kroeker
808410c2c7 Fix wrong comparison that made IMIN identical to IMAX
as suggested in #1990
2019-01-31 15:25:15 +01:00
maamountki
eaf20f0e7a Remove ztest 2019-01-31 09:26:50 +02:00
maamountki
fcd814a8d2 [ZARCH] Fix bug in max/min functions 2019-01-29 17:59:38 +02:00
maamountki
dc4d3bccd5 [ZARCH] Fix icamax/icamin 2019-01-29 03:47:49 +02:00
maamountki
c7143c1019 [ZARCH] Fix iamax/imax single precision 2019-01-28 17:52:23 +02:00
maamountki
04873bb174 [ZARCH] Undo the last commit 2019-01-28 17:32:24 +02:00
maamountki
c8ef9fb220 [ZARCH] Fix bug in iamax/iamin/imax/imin 2019-01-28 17:16:18 +02:00
Martin Kroeker
5be61f4b47 Merge pull request #1985 from martin-frbg/issue1984
Correct naming of getrf_parallel object
2019-01-28 15:44:57 +01:00
Martin Kroeker
3d155cff83 Merge pull request #1981 from edisongustavo/develop
Fix include directory of exported targets
2019-01-28 15:44:42 +01:00
Martin Kroeker
7d47f0a82d Merge pull request #1978 from danielgindi/feature/msvc_cmake
Better support for MSVC/Windows in CMake (v0.3.x)
2019-01-28 15:43:35 +01:00
Martin Kroeker
a529c71a74 Merge pull request #1962 from brada4/r
Modrenize R benchmarks slightly
2019-01-28 15:42:57 +01:00
TiborGY
ea1716ce2a Update Makefile.rule
Revert generate to install, explain the nature of the affinity conflict
2019-01-27 17:22:26 +01:00
TiborGY
0f24b39ebf Reword/expand comments in Makefile.rule
Lots of small changes in the wording of the comments, plus an expansion of the NUM_THREADS and NO_AFFINITY sections.
2019-01-27 15:33:00 +01:00
Martin Kroeker
89b60dab8a Merge pull request #1987 from martin-frbg/issue1961
Change ARMV8 target with BINARY=32 to ARMV7 automatically
2019-01-26 22:25:29 +01:00
Martin Kroeker
58dd7e4501 Change ARMV8 target to ARMV7 for BINARY=32 2019-01-26 17:52:33 +01:00
Martin Kroeker
36b844af88 Change ARMV8 target to ARMV7 when BINARY32 is set
fixes #1961
2019-01-26 17:47:22 +01:00
Martin Kroeker
e882b239aa Correct naming of getrf_parallel object
fixes #1984
2019-01-26 00:45:45 +01:00
Martin Kroeker
3f7bb87a2a Merge pull request #1971 from martin-frbg/trsm-threshold
Shift transition to multithreading towards larger matrix sizes
2019-01-24 09:17:48 +01:00
Edison Gustavo Muenz
e908ac2a51 Fix include directory of exported targets 2019-01-23 15:09:13 +01:00
Martin Kroeker
8533aca964 Avoid penalizing tall skinny matrices 2019-01-23 10:03:00 +01:00
Martin Kroeker
16494cb7c4 Merge pull request #1980 from martin-frbg/issue1979
Report SkylakeX as Haswell if compiler does not support AVX512
2019-01-22 21:10:38 +01:00
Martin Kroeker
b56b34a75c Syntax fix 2019-01-22 18:55:43 +01:00
Martin Kroeker
21eda8b577 Report SkylakeX as Haswell if compiler does not support AVX512
... or make was invoked with NO_AVX512=1
2019-01-22 18:47:12 +01:00
Daniel Cohen Gindi
24288803b3 Adjust test script for correct deployment 2019-01-22 14:38:01 +02:00
Martin Kroeker
f0d834b824 Use VERSION_LESS for comparisons involving software version numbers 2019-01-22 12:32:24 +01:00
Daniel Cohen Gindi
63bbd7b0d7 Better support for MSVC/Windows in CMake 2019-01-21 17:47:47 +02:00
maamountki
b111829226 [ZARCH] Update max/min functions 2019-01-21 15:56:04 +02:00
Martin Kroeker
010d59bfee Merge pull request #1973 from martin-frbg/issue1464
Increase Zen SWITCH_RATIO to 16
2019-01-20 20:30:11 +01:00
Martin Kroeker
83b5c6b92d Fix compilation with NO_AVX=1 set
fixes #1974
2019-01-20 12:18:53 +01:00
Martin Kroeker
bbfdd6c0fe Increase Zen SWITCH_RATIO to 16
following GEMM benchmarks on Ryzen2700X. For #1464
2019-01-19 23:01:31 +01:00
Martin Kroeker
cda81cfae0 Shift transition to multithreading towards larger matrix sizes
See #1886 and JuliaRobotics issue 500. trsm benchmarks on Haswell and Zen showed that with these values performance is roughly doubled for matrix sizes between 8x8 and 14x14, and still 10 to 20 percent better near the new cutoff at 32x32.
2019-01-19 00:10:01 +01:00
Martin Kroeker
32b0f1168e Fix declaration of input arguments in the Sandybridge GER microkernels (#1967)
* Tag arguments 0 and 1 as both input and output
2019-01-18 08:11:39 +01:00
Martin Kroeker
b495e54310 Fix declaration of input arguments in the x86_64 SCAL microkernels (#1966)
* Tag arguments 0 and 1 as both input and output (see #1964)
2019-01-18 08:11:07 +01:00
Martin Kroeker
d5e6940253 Fix declaration of input arguments in the x86_64 microkernels for DOT and AXPY (#1965)
* Tag operands 0 and 1 as both input and output

For #1964 (basically a continuation of coding problems first seen in #1292)
2019-01-17 23:20:32 +01:00
Martin Kroeker
24e697eadb Merge pull request #1970 from quickwritereader/develop
crot fix
2019-01-17 16:42:11 +01:00
Martin Kroeker
3e9fd6359d Bump xcode version to 10.1 to make sure it handles AVX512 2019-01-17 16:19:03 +01:00
Ubuntu
43a4572038 crot fix 2019-01-17 14:45:31 +00:00
Martin Kroeker
256eb588bb Merge pull request #1963 from quickwritereader/develop
Blas1 single missing kernels implemented with vector builtins
2019-01-16 18:41:03 +01:00
Abdelrauf
a034e65512 Merge branch 'develop' into develop 2019-01-16 19:25:13 +04:00
Ubuntu
8c3386be87 Added missing Blas1 single fp {saxpy, caxpy, cdot, crot(refactored version of srot),isamax ,isamin, icamax, icamin},
Fixed idamin,icamin choosing the first occurance index of equal minimals
2019-01-16 15:16:21 +00:00
Andrew
3e601bd419 disable NaN checks before BLAS calls dgemm.R 2019-01-16 11:54:22 +02:00
Andrew
478d3c4569 disable NaN checks before BLAS calls deig.R (shorten matrix def) 2019-01-16 11:41:46 +02:00
Andrew
3afceb6c2a disable NaN checks before BLAS calls deig.R 2019-01-16 11:38:14 +02:00
Andrew
7af8b21dbb disable NaN checks before BLAS calls dsolve.R (shorter formula) 2019-01-16 11:34:46 +02:00
Martin Kroeker
1e3ada6db4 Merge pull request #1960 from cnjsdfcy/Hygon
Add support for Hygon Dhyana
2019-01-16 10:27:14 +01:00
Andrew
2777a7f506 disable NaN checks before BLAS calls dsolve.R (shorter config part) 2019-01-16 11:23:51 +02:00
Andrew
b70fd23836 disable NaN checks before BLAS calls dsolve.R 2019-01-16 11:18:54 +02:00
Andrew
def0385caa init 2019-01-16 09:51:29 +02:00
caiyu
29dc72889f Add support for Hygon Dhyana 2019-01-16 14:25:19 +08:00
maamountki
b815a04c87 [ZARCH] fix a bug in max/min functions 2019-01-15 21:04:22 +02:00
Martin Kroeker
dbc9a060ef Fix missing braces in support_av() call 2019-01-14 22:41:31 +01:00
Martin Kroeker
00401489c2 Fix missing braces in support_avx() 2019-01-14 22:38:32 +01:00
maamountki
1a7925b3a3 [ZARCH] Update dgemv_n_4.c 2019-01-11 17:43:11 +02:00
maamountki
406f835f00 [ZARCH] update cgemv_n_4.c 2019-01-11 17:39:17 +02:00
maamountki
621dedb37b [ZARCH] Update cgemv_t_4.c 2019-01-11 17:37:11 +02:00
maamountki
b731e8246f Update sgemv_t_4.c 2019-01-11 17:14:04 +02:00
maamountki
ecc31b743f Update dgemv_t_4.c 2019-01-11 17:13:02 +02:00
maamountki
5d89d6b143 [ZARCH] fix sgemv_n_4.c 2019-01-11 17:08:24 +02:00
maamountki
67432b23c2 [ZARCH] fix cgemv_n_4.c 2019-01-11 16:44:46 +02:00
Martin Kroeker
21c0f2af7b Merge pull request #1957 from martin-frbg/issue1954
Move TLS key deletion to openblas_quit
2019-01-10 12:04:08 +01:00
Martin Kroeker
ad2c386d6a Move TLS key deletion to openblas_quit
fixes #1954 (as suggested by thrasibule in that issue)
2019-01-10 00:32:50 +01:00
maamountki
be66f5d5c2 [ZARCH] fix data prefetch type in sdot 2019-01-09 16:50:07 +02:00
maamountki
c2ffef8156 [ZARCH] fix data prefetch type in ddot 2019-01-09 16:49:44 +02:00
maamountki
e7455f500c [ZARCH] fix dsdot.c 2019-01-09 16:33:54 +02:00
maamountki
3eafcfa650 [ZARCH] fix cgemv_n_4.c 2019-01-09 07:43:45 +02:00
Martin Kroeker
8d99dba86b Merge pull request #1949 from martin-frbg/issue1947
Query AVX2 and AVX512VL support when selecting x86 kernels
2019-01-08 20:44:08 +01:00
Martin Kroeker
1650311246 Bump xcode to 8.3 2019-01-08 14:43:45 +01:00
Martin Kroeker
cf5d48e833 Update OSX environment to Sierra
as homebrew seems to have dropped support for El Capitan in their gcc packages
2019-01-08 14:41:48 +01:00
Martin Kroeker
191677b902 Add travis_wait to the OSX brew install phase 2019-01-08 10:46:47 +01:00
Martin Kroeker
31ed19e8b9 Add message for SkylakeX and KNL fallbacks to Haswell 2019-01-05 19:41:13 +01:00
Martin Kroeker
e1574fa2b4 Add xcr0 (os support) check 2019-01-05 18:08:02 +01:00
Martin Kroeker
68eb3146ce Add xcr0 (os support) check 2019-01-05 18:07:14 +01:00
Martin Kroeker
0afaae4b23 Query AVX2 and AVX512VL capability in x86 cpu detection 2019-01-05 16:58:56 +01:00
Martin Kroeker
ae1d1f74f7 Query AVX2 and AVX512 capability for runtime cpu selection 2019-01-05 16:55:33 +01:00
maamountki
94cd946b96 [ZARCH] fix cgemv_n_4.c 2019-01-04 17:45:56 +02:00
Martin Kroeker
ed01f4932a Merge pull request #1946 from martin-frbg/issue1908
More fixes for cross-compiling ARM64 targets
2019-01-04 01:37:37 +01:00
maamountki
1aa840a0a2 [ZARCH] fix sgemv_t_4.c 2019-01-04 01:38:18 +02:00
Martin Kroeker
802f0dbde1 More fixes for cross-compiling ARM64 targets
Fixed core naming for DYNAMIC_ARCH. Corrected GEMM_DEFAULT entries and added SYMV_P. Replaced outdated VULCAN define for ThunderX2T99 with ARMV8 to get basic definitions back. For issue #1908
2019-01-03 22:17:31 +01:00
Martin Kroeker
20d1aad13f Fix missing quotes around thunderx targets 2019-01-02 20:15:35 +01:00
TiborGY
d11554c88f Validate user supplied TARGET (#1941)
the build will now abort with an error message when an undefined build TARGET is named

Fixes #1938
2018-12-31 23:19:44 +01:00
Martin Kroeker
ed704185ab Increment version to 0.3.6.dev 2018-12-31 23:11:37 +01:00
Martin Kroeker
2940798ea7 Increment version to 0.3.6.dev 2018-12-31 23:10:59 +01:00
Martin Kroeker
eebc189287 Version 0.3.5 2018-12-31 23:09:59 +01:00
Martin Kroeker
9185d419d3 Version 0.3.5 2018-12-31 23:09:20 +01:00
Martin Kroeker
4cf9d32694 Merge pull request #1945 from xianyi/develop
Merge changes from develop for 0.3.5 release
2018-12-31 23:08:25 +01:00
Martin Kroeker
1c75b65d53 Merge branch 'release-0.3.0' into develop 2018-12-31 23:07:53 +01:00
Martin Kroeker
13d006339b Update ChangeLog.txt with changes from 0.3.5 2018-12-31 23:00:46 +01:00
Martin Kroeker
bf76162635 Merge pull request #1944 from hartzell/patch-1
Typo: Skyalke -> Skylake
2018-12-31 18:36:18 +01:00
George Hartzell
0d52aefc6b Typo: Skyalke -> Skylake
Worth fixing, it gets in the way of searching....
2018-12-30 14:55:34 -08:00
Martin Kroeker
a6787b0f81 Merge pull request #1939 from TiborGY/patch-2
Fix typo in UNKNOWN core name
2018-12-30 20:10:05 +01:00
Martin Kroeker
8643521127 Merge pull request #1943 from martin-frbg/issue1748
Re-enable loop unrolling in trmv and remove the scary warning
2018-12-30 20:07:01 +01:00
Martin Kroeker
5a720cf9ca Re-enable loop unrolling in trmv and remove the scary warning
fixes #1748 as that half of the fix for #1332 appears to have been an overreaction on my part.
2018-12-30 15:22:37 +01:00
Martin Kroeker
ccd5945d38 Merge pull request #1942 from martin-frbg/issue1720
Delete the pthread key on cleanup in TLS mode
2018-12-30 14:47:05 +01:00
Martin Kroeker
9f80e0f5fc Remove stray include of complex.h
already provided conditionally by common.h via openblas_utest.h
Unconditional inclusion breaks older Android and similar platforms that use OPENBLAS_COMPLEX_STRUCT
2018-12-30 14:39:18 +01:00
Martin Kroeker
bba1e67269 Delete the pthread key on cleanup in TLS mode
to avoid a crash when OpenBLAS was loaded via dlopen and libc tries to clean up the leaked TLS after dlclose
Fixes #1720
2018-12-29 21:59:31 +01:00
Martin Kroeker
93240f489e Fix wrong case in TARGET setting for Alpine 2018-12-29 18:12:54 +01:00
TiborGY
7cbc2c37d6 Update cpuid_mips64.c 2018-12-28 14:36:39 +01:00
TiborGY
c329de2931 Update Makefile 2018-12-28 14:35:41 +01:00
TiborGY
187233953c Update cpuid_mips.c 2018-12-28 14:34:38 +01:00
TiborGY
09170268a3 Update cpuid_arm.c 2018-12-28 14:33:18 +01:00
TiborGY
211120c508 Fix typo in UNKNOWN core name
Should be of no consequence, right?
2018-12-27 23:09:21 +01:00
Martin Kroeker
9e4d190f4f Merge pull request #1932 from martin-frbg/issue1915
Add -fPIC to provided CFLAGS/FFLAGS if required
2018-12-24 23:48:33 +01:00
Martin Kroeker
fe02ba86a4 Remove unnecessary change again 2018-12-24 20:46:04 +01:00
Martin Kroeker
284fb00971 Merge pull request #1934 from fenrus75/betagoof
Fix thinko in skylake beta handling
2018-12-24 19:53:50 +01:00
Arjan van de Ven
795285c587 Fix thinko in skylake beta handling
casting ints is cheaper but it has a rounding, not memory casing effect, resulting in
invalid outcome
2018-12-24 18:49:50 +00:00
Martin Kroeker
d6818777d1 Make sure that -fPIC is present if needed 2018-12-23 23:47:37 +01:00
Martin Kroeker
5bd21ab6e1 Make sure that -fPIC is present when needed
override user-provided FFLAGS if necessary
2018-12-23 23:46:48 +01:00
Martin Kroeker
e1eab96502 Merge pull request #1931 from martin-frbg/pr1921
Add -mavx2 to TARGET=HASWELL builds
2018-12-23 23:15:54 +01:00
Martin Kroeker
76b4b8980f Use -dumpversion with gcc only 2018-12-23 19:08:19 +01:00
Martin Kroeker
49e0f485da Add -mavx2 for TARGET=HASWELL if compiler supports and requires it 2018-12-23 17:26:09 +01:00
Martin Kroeker
43c2b0eb55 Add -mavx2 to TARGET=HASWELL builds
to leverage improvements from PR#1921
2018-12-23 17:16:43 +01:00
Martin Kroeker
942e229ed5 Merge pull request #1930 from martin-frbg/issue1908
Reflect ARMV8 target definition changes from PR1876
2018-12-23 15:06:33 +01:00
Martin Kroeker
26a3402773 Reflect ARMV8 target definition changes from PR1876
and create config target directory for cross-compiles.
2018-12-23 12:26:01 +01:00
Martin Kroeker
20033f992a Merge pull request #1929 from martin-frbg/issue1924
Avoid taking the root of a negative number in simple threaded syrk
2018-12-23 09:03:58 +01:00
Martin Kroeker
f343ed65b5 Avoid taking the root of a negative number
Fixes #1924 where numpy 1.17+ would report the (transient) FE_INVALID exception raised for the domain error.
2018-12-22 22:30:29 +01:00
Martin Kroeker
a5a1118527 Merge pull request #1 from xianyi/develop
rebase
2018-12-22 22:13:44 +01:00
Martin Kroeker
e23366e860 Merge pull request #1921 from fenrus75/haswelldgemm
Replicate some of the SKYLAKEX dgemm improvements also to HASWELL
2018-12-17 08:39:20 +01:00
Arjan van de Ven
b28f75cd7e set GEMM_PREFERED_SIZE for HASWELL
Haswell likes a GEMM_PREFERED_SIZE of 16 to improve the split that the
threading code does to make it a nice multiple of the SIMD kernel size
2018-12-16 23:09:27 +00:00
Arjan van de Ven
d321448a63 dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell
The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives
a nice performance boost for medium sized matrices
2018-12-16 23:09:22 +00:00
Arjan van de Ven
c43331ad0a dgemm: Use the skylakex beta function also for haswell
it's more efficient for certain tall/skinny matrices
2018-12-16 23:09:17 +00:00
Martin Kroeker
e8ca5a59a9 Merge pull request #1919 from fenrus75/haswelltuning
(sgemm) Apply some of the SKYLAKEX optimizations also to HASWELL
2018-12-16 20:11:05 +01:00
Martin Kroeker
c4e23dd016 Update Makefile 2018-12-16 18:14:40 +01:00
Martin Kroeker
cfc4acc221 typo 2018-12-16 16:19:51 +01:00
Martin Kroeker
545c2b1bbb Add -mavx2 on Haswell only if the compiler supports it 2018-12-16 13:09:19 +01:00
Arjan van de Ven
69d206440a Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support 2018-12-16 00:19:41 +00:00
Martin Kroeker
3843e3e017 use -maxv2 on haswell 2018-12-15 23:30:31 +01:00
Martin Kroeker
fbcb14a74b should be core-avx2 2018-12-15 20:18:59 +01:00
Martin Kroeker
2a3190dc76 fix elseifeq and use older option core2-avx for compatibility 2018-12-15 20:17:44 +01:00
Martin Kroeker
1ebe5c0f49 Add -march=haswell to HASWELL part of DYNAMIC_ARCH build 2018-12-15 19:35:35 +01:00
Arjan van de Ven
0586899a10 Use sgemm_ncopy_4_skylakex.c also for Haswell
sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the
real perf win happens; this also works great for Haswell.

This gives double digit percentage gains on small and skinny matrices
2018-12-15 13:49:19 +00:00
Arjan van de Ven
00dc09ad19 Use the skylake sgemm beta code also for haswell
with a few small changes it's possible to use the skylake sgemm code
also for haswell, this gives a modest gain (10% range) for smallish
matrixes but does wonders for very skinny matrixes
2018-12-15 13:49:13 +00:00
Martin Kroeker
78d877b54b Merge pull request #1914 from fenrus75/smallmatrix
Add a "sgemm direct" mode for small matrixes
2018-12-13 19:08:14 +01:00
Arjan van de Ven
cdc668d82b Add a "sgemm direct" mode for small matrixes
OpenBLAS has a fancy algorithm for copying the input data while laying
it out in a more CPU friendly memory layout.

This is great for large matrixes; the cost of the copy is easily
ammortized by the gains from the better memory layout.

But for small matrixes (on CPUs that can do efficient unaligned loads) this
copy can be a net loss.

This patch adds (for SKYLAKEX initially) a "sgemm direct" mode, that bypasses
the whole copy machinary for ALPHA=1/BETA=0/... standard arguments,
for small matrixes only.

What is small? For the non-threaded case this has been measured to be
in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's
less, around M*N*K = 1 * 512 * 512
2018-12-13 13:47:31 +00:00
Martin Kroeker
87718807f0 Merge pull request #1910 from martin-frbg/issue1909
Fix for DYNAMIC_ARCH builds made on a AVX512-capable host
2018-12-12 14:56:25 +01:00
Martin Kroeker
51aec8e96b make sure the added march=skylake-avx512 does not cause problems on Windows 2018-12-11 22:47:32 +01:00
Martin Kroeker
06f7d78d70 Add -march=skylake-avx512 to SkylakeX part of DYNAMIC_ARCH builds 2018-12-11 21:10:38 +01:00
Martin Kroeker
38cc638591 Avoid adding blanket march=skylake-avx512 to dynamic_arch builds 2018-12-11 21:09:26 +01:00
Martin Kroeker
0bf6d74e5f Fix typo in previous commit for arm dynamic arch 2018-12-07 19:37:33 +01:00
Martin Kroeker
133c278ee5 Add DYNAMIC_CORE list for ARM64
cf #1908
2018-12-07 17:42:23 +01:00
Martin Kroeker
2b355592e3 Make sure to use the arm version of dynamic.c in ARM64 DYNAMIC_ARCH
cf. #1908
2018-12-07 16:25:55 +01:00
Martin Kroeker
ff3eb1d474 Merge pull request #1904 from martin-frbg/issue1870
Fix cmake parsing of GEMM kernels for ARMV8
2018-12-06 23:01:23 +01:00
Martin Kroeker
0b09516678 Fix missing parameter in popen call 2018-12-06 18:33:05 +01:00
Martin Kroeker
7639f2e1f0 Rewrite the conditional for OSX to fix cmake parsing on others
The Makefile variable parser in utils.cmake currently does not handle conditionals. Having the definitions for non-OSX last will at least make cmake builds work again on non-OSX platforms.
2018-12-06 14:04:27 +01:00
Martin Kroeker
2fc712469d Avoid creating spurious non-suffixed c/zgemm_kernels
Plain cgemm_kernel and zgemm_kernel are not used anywhere, only cgemm_kernel_b etc.
Needlessly building them (without any define like NN, CN, etc.) just happened to work on most platforms, but not on arm64. See #1870
2018-12-06 13:56:06 +01:00
Martin Kroeker
6ba30e270d Fix typo that broke CNRM2 on ARMV8 since 0.3.0
must have happened in my #1449
2018-12-06 13:42:25 +01:00
Martin Kroeker
bf23518e36 Merge pull request #1903 from rengolin/armv8
Fix two mistakes on Arm64 builds
2018-12-05 22:10:53 +01:00
Renato Golin
31a490ea88 Fix two mistakes on Arm64 builds
* Falkor is an ARMv8.0 with ARMv8.1 features, and chosing armv8.1-a for
   march generates instructions it cannot cope with. Reverting it back
   to armv8-a.
 * ThunderX2's build was left with a #define VULCAN, which made it miss
   the right compiler flags in Makefile.arm64, although it did create
   the right library in the end.
2018-12-05 18:51:38 +00:00
Martin Kroeker
701ea88347 Use p2align instead of align for OSX compatibility
fixes #1902
2018-12-03 13:06:43 +01:00
Martin Kroeker
721c56c224 Merge pull request #1899 from brada4/fbsd12
Add mutually supported architecture mappings for FreeBSD12 ports
2018-12-03 12:50:27 +01:00
Martin Kroeker
c5f8aeff2d Merge branch 'develop' into fbsd12 2018-12-03 12:50:14 +01:00
Martin Kroeker
8278cbe7f8 Merge pull request #1894 from pkubaj/patch-2
Use correct ARCH name on BSD powerpc64
2018-12-03 12:48:53 +01:00
Martin Kroeker
ea6d1b96bd Update Makefile.system 2018-12-03 08:59:10 +01:00
Martin Kroeker
360374be62 Update with the changes from 0.3.4 2018-12-02 23:44:13 +01:00
Martin Kroeker
f5acaad8f0 Increment version to 0.3.5.dev 2018-12-02 23:43:15 +01:00
Martin Kroeker
93fa6b7b76 Increment version to 0.3.5.dev 2018-12-02 23:42:33 +01:00
Martin Kroeker
c0827a7164 Update with changes from 0.3.4 2018-12-02 23:41:17 +01:00
Martin Kroeker
86cff4effc Merge pull request #1900 from xianyi/develop
Update from develop for 0.3.4
2018-12-02 23:40:21 +01:00
Martin Kroeker
b028960aba Merge branch 'release-0.3.0' into develop 2018-12-02 23:38:49 +01:00
Martin Kroeker
3c9e3faedb fixup BSD naming of powerpc arch 2018-12-02 23:24:53 +01:00
Andrew
44c81fd135 oops 2018-12-02 20:27:53 +01:00
Andrew
26b3710485 Add architecture mappings for FreeBSD12 2018-12-02 12:07:41 +01:00
Andrew
84e614d0fd init 2018-12-02 12:05:15 +01:00
Martin Kroeker
dceff5542c Handle Android environments that identify as Linux (#1898)
* Handle Android environments that identify as Linux

termux terminal emulator does this, causing build failures through missed defines in common.h
2018-12-01 20:56:11 +01:00
Martin Kroeker
6c7b691083 Really revert xDOT changes from 1832
neglected to rebase #1892 on merging
2018-11-30 21:32:01 +01:00
Martin Kroeker
5f4c550c27 Merge pull request #1892 from martin-frbg/mipsdot
revert MIPS64 xDOT kernel changes from #1832
2018-11-30 21:28:21 +01:00
pkubaj
731b2722ba Fix build on POWER, remove DragonFly, add NetBSD
__asm is complete on its own

DBSD developers state they will only support amd64, but NetBSD supports POWER.
2018-11-30 21:12:05 +01:00
pkubaj
f85ce54d4a Use correct Makefile on powerpc64
FreeBSD uses powerpc64 name for POWER architecture. Use correct Makefile for this platform.
2018-11-30 16:05:49 +00:00
Andrew
2601cd58ab remove surplus locking code , only enabled w x86, disabled or never enabled on all others 2018-11-30 11:38:19 +01:00
Martin Kroeker
95a5542e3c Revert DOT kernel changes from #1834
as the failures seen on Loongson3A appear to be limited to DSDOT/SDSDOT (i.e. my hackish "fix" from #1684)
2018-11-30 11:16:24 +01:00
Martin Kroeker
7a2e1bc804 Use generic kernel for DSDOT/SDSDOT
as discussed in #1834
2018-11-30 10:57:09 +01:00
Martin Kroeker
35653e38b3 Merge pull request #1834 from fengrl/develop
register push/pop command change
2018-11-30 10:48:46 +01:00
Martin Kroeker
71e25ae42f Merge pull request #1890 from martin-frbg/issue1889
Include version number in openblas_get_config output
2018-11-29 15:47:35 +01:00
Martin Kroeker
97d7298973 call it OpenBLAS not just version 2018-11-29 11:52:08 +01:00
Martin Kroeker
de0d0ed52f Improve formatting of config output 2018-11-29 11:28:19 +01:00
Martin Kroeker
081ceb3e02 Propagate version number for openblas_get_config 2018-11-29 00:12:04 +01:00
Martin Kroeker
a29ec458c2 propagate verison number for openblas_config_version 2018-11-29 00:10:49 +01:00
Martin Kroeker
816775e309 Add version information to openblas_get_config output 2018-11-29 00:06:44 +01:00
Martin Kroeker
b6363f4539 Merge pull request #1885 from brada4/freebsd
Fix freebsd clang compilation of skylakex
2018-11-25 22:20:13 +01:00
Andrew
19c4bdd8b3 Add return value so that freebsd system clang does not err out 2018-11-25 21:35:01 +01:00
Andrew
f049a4c84f init 2018-11-25 21:34:09 +01:00
Martin Kroeker
f72fdf525c Merge pull request #1875 from martin-frbg/issue1851
Serialize accesses to parallelized level3 functions from multiple cal…
2018-11-25 20:53:46 +01:00
Martin Kroeker
5393759a98 Merge pull request #1869 from martin-frbg/axpy0
Handle special case INCX=0,INCY=0 in the axpy interface
2018-11-25 20:52:49 +01:00
Martin Kroeker
5cf18e2875 Merge pull request #1878 from kiwifb/PGI_f_check
Correct link flags for PGI compiler.
2018-11-25 20:51:50 +01:00
Martin Kroeker
910050985a Merge pull request #1876 from rengolin/armv8-cleanup
Simplifying ARMv8 build parameters
2018-11-25 20:51:24 +01:00
François Bissey
0184713e1a Correct link flags for PGI compiler. 2018-11-21 14:24:56 +13:00
Martin Kroeker
45c3c459e1 Merge pull request #1868 from martin-frbg/aix_cpuid
Use prtconf to determine CPU type on AIX
2018-11-20 17:25:57 +01:00
Martin Kroeker
113cb00b95 fix missing parenthesis 2018-11-19 21:01:36 +01:00
Martin Kroeker
5192651706 Add CriticalSection handling instead of mutexes for Windows 2018-11-19 17:58:22 +01:00
Renato Golin
310ea55f29 Simplifying ARMv8 build parameters
ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode
(which is not right because TX2 is ARMv8.1) as well as requiring a few
redundancies in the defines, making it harder to maintain and understand
what core has what. A few other minor issues were also fixed.

Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX,
ThunderX2, and XGene.

Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester.

A summary:
 * Removed TX2 code from ARMv8 build, to make sure it is compatible with
   all ARMv8 cores, not just v8.1. Also, the TX2 code has actually
   harmed performance on big cores.
 * Commoned up ARMv8 architectures' defines in params.h, to make sure
   that all will benefit from ARMv8 settings, in addition to their own.
 * Adding a few more cores, using ARMv8's include strategy, to benefit
   from compiler optimisations using mtune. Also updated cache
   information from the manuals, making sure we set good conservative
   values by default. Removed Vulcan, as it's an alias to TX2.
 * Auto-detecting most of those cores, but also updating the forced
   compilation in getarch.c, to make sure the parameters are the same
   whether compiled natively or forced arch.

Benefits:
 * ARMv8 build is now guaranteed to work on all ARMv8 cores
 * Improved performance for ARMv8 builds on some cores (A72, Falkor,
   ThunderX1 and 2: up to 11%) over current develop
 * Improved performance for *all* cores comparing to develop branch
   before TX2's patch (9% ~ 36%)
 * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than
   current develop's branch and 8% faster than deveop before tx2 patches

Issues:
 * Regression from current develop branch for A53 (-12%) and A57 (-3%)
   with ARMv8 builds, but still faster than before TX2's commit (+15%
   and +24% respectively). This can be improved with a simplification of
   TX2's code, to be done in future patches. At least the code is
   guaranteed to be ARMv8.0 now.

Comments:
 * CortexA57 builds are unchanged on A57 hardware from develop's branch,
   which makes sense, as it's untouched.
 * CortexA72 builds improve over A57 on A72 hardware, even if they're
   using the same includes due to new compiler tunning in the makefile.
2018-11-19 16:41:49 +00:00
Martin Kroeker
2e6fae2aad Serialize accesses to parallelized level3 functions from multiple callers
for #1851
2018-11-19 14:02:50 +01:00
Martin Kroeker
368d14f8c8 Fix harmless typo
fixes #1872
2018-11-16 14:58:28 +01:00
Martin Kroeker
42bc2a9202 Fix copy-paste errors (POWER8/9 and extraneous return) 2018-11-16 12:10:44 +01:00
fengruilin
43bb386b10 fix dot problem on 64bit mips 2018-11-15 11:11:59 +08:00
Martin Kroeker
c171b8ad13 Handle special case INCX=0,INCY=0 in the axpy interface 2018-11-13 13:57:18 +01:00
Martin Kroeker
2f04cf22ac Detect POWER9 as POWER8 on AIX and Linux
(already supported by the *BSD version)
2018-11-13 08:16:14 +01:00
Martin Kroeker
807f6e6922 Use prtconf to determine CPU type on AIX
for #1803
2018-11-12 18:52:29 +01:00
Martin Kroeker
ecbeb802a0 Merge pull request #1865 from martin-frbg/issue1844
Optimize gemv for small M, large N only if it can be done in a threadsafe manner
2018-11-12 17:30:44 +01:00
Martin Kroeker
2c5725cc39 Merge pull request #1864 from aytekinar/patch-1
Add ARM tests on Travis
2018-11-12 14:30:28 +01:00
Arda Aytekin
e3666931d8 Update .travis.yml
Updated `.travis.yml` file to add emulated tests for `ARMV6` and `ARMV8`
architectures with `gcc` and `clang`.  Created prebuilt images with
required dependencies. Squashed layers into one.
2018-11-11 20:50:38 +01:00
Martin Kroeker
ae02a57261 Merge pull request #1866 from martin-frbg/issue1859
Fix argument in SLASET call to zero S
2018-11-10 19:23:31 +01:00
Martin Kroeker
a6a52a73f7 Fix argument in SLASET call to zero S
fixes #1859 in accordance with https://github.com/LAPACK-Reference/issue/296
2018-11-10 17:16:53 +01:00
Martin Kroeker
0427277cef Allow optimization for small m, large n only if it can be made threadsafe
otherwise the introduction of a static array in 8e5a108 to improve #532 breaks concurrent calls from multiple threads as seen in #1844
2018-11-10 15:45:54 +01:00
Martin Kroeker
4f43668eec Merge pull request #2 from xianyi/develop
merge develop
2018-11-10 15:37:25 +01:00
Martin Kroeker
b0c15bacc1 Merge pull request #1863 from martin-frbg/aix_install3
Set LIBSONAME suffix to .a for AIX
2018-11-09 13:12:06 +01:00
Martin Kroeker
cfb0f5b0f8 Set LIBSONAME suffix to .a for AIX
another fix for #1803
2018-11-08 22:39:10 +01:00
Martin Kroeker
667fed579d Merge pull request #1856 from rengolin/armv8-a57
[Arm64) Revert A53 detection as A57
2018-11-07 21:01:29 +01:00
Martin Kroeker
96d2f2c9b2 Merge pull request #1831 from brada4/hemv
disable threading in C/ZSWAP copying from S/DSWAP
2018-11-07 08:49:21 +01:00
Martin Kroeker
653e657a58 Merge pull request #1857 from brada4/fc-1847
Add gfortran -frecursive option from upstream and #1847
2018-11-07 08:48:31 +01:00
Martin Kroeker
5f8f0583d4 Merge branch 'develop' into fc-1847 2018-11-07 08:47:52 +01:00
Martin Kroeker
974a6a30f2 Merge pull request #1858 from brada4/buff-1847
Add minimum threshold for number of buffers
2018-11-07 08:46:55 +01:00
Andrew
9531d0e175 lets fit it in one 4k page 2018-11-06 17:51:24 +00:00
Andrew
40cce0e353 handle cmake too 2018-11-06 09:45:49 +00:00
Andrew
3fd41313fc add low bound for number of buffers 2018-11-06 09:40:13 +00:00
Andrew
a931afe269 init 2018-11-06 09:39:05 +00:00
Andrew
7d3502b500 Add -frecursive gfortran option by default 2018-11-06 08:20:55 +00:00
Andrew
066f8065d1 init 2018-11-06 08:19:08 +00:00
Renato Golin
fb5b2177ca [Arm64) Revert A53 detection as A57
This patch reverts the decision of treating A53 like A57, which was
based on an analysis done on server class hardware and is not
representative of all A53s out there.

Fixes #1855.
2018-11-05 11:34:49 +00:00
Martin Kroeker
f1c02273cb Merge pull request #1846 from fenrus75/threadsize
gemm/dgemm: add a way for an arch kernel to specify preferred sizes
2018-11-02 13:18:01 +01:00
Martin Kroeker
661035477c Merge pull request #1850 from martin-frbg/issue1811
Restore Android/ARMv7 build fix from #778
2018-11-02 09:50:51 +01:00
Martin Kroeker
aa7e47aa0a Merge pull request #1849 from martin-frbg/aix_install2
Use installbsd on AIX
2018-11-01 20:39:16 +01:00
Martin Kroeker
9c177d270b Restore Android/ARMv7 build fix from #778
for #1811
2018-11-01 18:50:25 +01:00
Martin Kroeker
b025523197 Use installbsd on AIX
(and fix misplaced parenthesis from previous commit). See #1803
2018-11-01 18:26:08 +01:00
Martin Kroeker
5b50bd36f7 Merge pull request #1845 from martin-frbg/aix_install
Accomodate AIX install, which has different syntax
2018-11-01 09:53:10 +01:00
Arjan van de Ven
5b708e5eb1 sgemm/dgemm: add a way for an arch kernel to specify prefered sizes
The current gemm threading code can make very unfortunate choices, for
example on my 10 core system a 1024x1024x1024 matrix multiply ends up
chunking into blocks of 102... which is not a vector friendly size
and performance ends up horrible.

this patch adds a helper define where an architecture can specify
a preference for size multiples.
This is different from existing defines that are minimum sizes and such.

The performance increase with this patch for the 1024x1024x1024 sgemm
is 2.3x (!!)
2018-11-01 01:43:20 +00:00
Arjan van de Ven
dcc5d6291e skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case
in the threading code there are cases where N or M can become 0,
and the optimized beta code did not handle this well, leading
to a crash

during the audit for the crash a few edge conditions on the if statements
were found and fixed as well
2018-11-01 01:42:09 +00:00
Martin Kroeker
7b5aea52bb Accomodate AIX install, which has different syntax
for #1803
2018-10-31 21:50:34 +01:00
Martin Kroeker
f5595d0262 Merge pull request #1843 from martin-frbg/aix_numprocs
Add get_num_procs implementation for AIX
2018-10-31 21:25:15 +01:00
Martin Kroeker
326d394a0f Add get_num_procs implementation for AIX
(and copy HAIKU implementation to the non-TLS version of the code as well)
2018-10-31 18:38:22 +01:00
Martin Kroeker
6af8e35a24 Merge pull request #1837 from embray/set-num-thread-after-fork
Ensure that blas_thread_init has been called in openblas_set_num_threads
2018-10-30 12:41:24 +01:00
Erik M. Bray
38cf5d9364 ensure that threading has been initialized in the first place before calling openblas_set_num_threads 2018-10-28 21:16:52 +00:00
Martin Kroeker
8a43baacb2 Merge pull request #1836 from martin-frbg/zen2core
Fix detection of Ryzen2 (missing CORE_ZEN)
2018-10-28 20:00:01 +01:00
Martin Kroeker
64ca44873b Fix detection of Ryzen2 (missing CORE_ZEN) 2018-10-28 18:36:55 +01:00
fengrl
2d8064174c register push/pop command change
64bit push/pop register command should be used. Otherwise, data will lost.
2018-10-26 17:55:15 +08:00
Martin Kroeker
76a66eaac8 Merge pull request #1829 from ashwinyes/develop_aarch64_dynamic_arch_support
Add DYNAMIC_ARCH support for ARM64
2018-10-23 18:14:28 +02:00
Andrew
2992e3886a disable threading in C/ZSWAP copying from S/DSWAP 2018-10-22 23:21:49 +03:00
Ashwin Sekhar T K
d5aeff636f ARM64: Enable DYNAMIC_ARCH
Enable DYNAMIC_ARCH feature on ARM64. This patch uses the cpuid
feature in linux kernel to detect the core type at runtime
(https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt).

If this feature is missing in kernel, then the user should use the
OPENBLAS_CORETYPE env variable to select the desired core type.
2018-10-22 01:49:35 -07:00
Ashwin Sekhar T K
af2837c392 ARM64: Remove #define ARMV8 for THUNDERX 2018-10-22 01:49:35 -07:00
Ashwin Sekhar T K
e7b66cd36e ARM64: Fix DYNAMIC_ARCH compilation for cores which dont use GEMM3M 2018-10-22 01:45:51 -07:00
Ashwin Sekhar T K
d50abc8903 ARM64: Move parameters from parameter.c to param.h
Remove the runtime setting of P, Q, R parameters for
targets ARMV8, THUNDERX2T99. Instead set them as constants
in param.h at compile time.
2018-10-22 01:45:51 -07:00
Ashwin Sekhar T K
351a0c777c ARM64: Remove XGENE1 references
Remove XGENE1 target as the implementation for the
same is incomplete. Moreover whoever wishes to use
on XGENE1 can use the generic ARMV8 target as there
are no XGENE1 specific optimizations in OpenBLAS.
2018-10-22 01:45:51 -07:00
Martin Kroeker
e3c262e5cf Merge pull request #1825 from brada4/hemv
Delay _hemv threading in attempt to address #1820
2018-10-21 20:34:05 +02:00
Andrew
a293bdcd5e re-arrange new code for readability 2018-10-20 21:37:53 +03:00
Andrew
c7bbf9c987 Attempt to tame _hemv threading #1820 2018-10-20 11:13:29 +03:00
Andrew
898a8dcaba init 2018-10-20 10:55:04 +03:00
Martin Kroeker
71c6deed60 Merge pull request #1821 from ashwinyes/develop_aarch64_armv8neonkernels
Use ThunderX2 Neon Kernels for ARMV8 Target
2018-10-18 08:13:05 +02:00
Ashwin Sekhar T K
21f46a1cf2 ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8
Currently the generic ARMV8 target uses C implementations
for many routines. Replace these with the neon implementations
written for THUNDERX2T99 target which are upto 6x faster for
certain routines.
2018-10-17 10:44:37 -07:00
Ashwin Sekhar T K
caf339412f ARM64: Remove dependency of THUNDERX2T99 Makefile on CORTEXA57 Makefile 2018-10-17 08:02:40 -07:00
Ashwin Sekhar T K
8001fdcd2a ARM64: Remove dependency of THUNDERX Makefile on ARMV8 Makefile 2018-10-17 08:02:16 -07:00
Ashwin Sekhar T K
162e312832 ARM64: Remove dependency of CORTEXA57 Makefile on ARMV8 Makefile 2018-10-17 08:01:45 -07:00
Ashwin Sekhar T K
c3d93caa8d ARM64: Remove dependency of XGENE1 Makefile on ARMV8 Makefile 2018-10-17 08:01:27 -07:00
Martin Kroeker
a71923514f Merge pull request #1815 from fenrus75/sgemm_beta_fix
enable the SGEMM/SKX C based kernel
2018-10-14 19:57:34 +02:00
Arjan van de Ven
55b244ca0d enable the SGEMM/SKX C based kernel
In QA the final bug was found so now the sklyakex sgemm C based kernel can
be activated....
2018-10-12 09:30:35 +00:00
Martin Kroeker
2263d3906c Merge pull request #1812 from martin-frbg/issue1806-2
Use KERNEL_DEFINITIONS rather than COMMON_OPTS to pass -march=skylake…
2018-10-11 21:51:31 +02:00
Martin Kroeker
81c9985c3a Use KERNEL_DEFINITIONS rather than COMMON_OPTS to pass -march=skylake-avx512 2018-10-11 11:03:27 +02:00
Martin Kroeker
56ebc7b53e Merge pull request #1808 from martin-frbg/issue1806
Add -march=skylake-avx512 to CFLAGS when the target is Skylake
2018-10-11 07:48:08 +02:00
Martin Kroeker
c5f88f5a57 Merge pull request #1807 from xianyi/revert-1798-cmake-avx512
Revert "Add -march=skylake-avx512 when required"
2018-10-11 07:47:53 +02:00
Martin Kroeker
8a11ec19d1 Syntax fix 2018-10-10 23:47:35 +02:00
Martin Kroeker
fa53b903db Add -march=skylake-avx512 to CFLAGS when the target is Skylake
Should fix 1806 and #1801
2018-10-10 19:22:01 +02:00
Martin Kroeker
84bcdf9c66 Revert "Add -march=skylake-avx512 when required" 2018-10-10 19:15:32 +02:00
Martin Kroeker
8f7e986184 Merge pull request #1802 from martin-frbg/issue1801
Use avx512 workaround with msys2/mingw64 as well
2018-10-10 08:52:53 +02:00
Martin Kroeker
d0e83666ad Merge pull request #1804 from fenrus75/sgemm
Add a C+intrinsics version of the SGEMM/skylakex kernel
2018-10-10 08:50:44 +02:00
Arjan van de Ven
d4bad73834 Add a C+intrinsics version of the SGEMM/skylakex kernel
for most sizes this is 1.2x to 1.4x faster than the current code
2018-10-10 01:49:22 +00:00
Martin Kroeker
065763adde Merge pull request #1800 from fengrl/patch-1
Update common_mips64.h for the 1st loop of blas_memory_alloc
2018-10-09 10:56:37 +02:00
Martin Kroeker
210b03b543 Merge pull request #1792 from martin-frbg/cmakesuffix
Improve CMake help output and add SYMBOLPREFIX and -SUFFIX options
2018-10-09 10:34:52 +02:00
Martin Kroeker
6234a32656 Use cygwin compilation workaround for avx512 on msys2/mingw64 as well 2018-10-09 10:31:59 +02:00
Martin Kroeker
c0d7cd3dac Merge pull request #1799 from martin-frbg/issue1796
Handle conflicting usage of ARCH in at least some BSD environments
2018-10-09 08:20:52 +02:00
Martin Kroeker
667f0cc1cb Merge pull request #1793 from fenrus75/ncopy
Add optimized *copy versions for skylakex
2018-10-09 08:19:14 +02:00
fengrl
d4c8853a02 Update common_mips64.h 2018-10-09 11:20:16 +08:00
Martin Kroeker
d3d58f8ee5 Catch conflicting usage of ARCH in at least some BSD environments
fixes #1796
2018-10-08 22:29:35 +02:00
Martin Kroeker
697dc1baf8 Use override for ARCH in make.inc
in case a conflicting setting of ARCH (for architecture) gets pulled in from the environment
(originally suggested by dloghin in #1753)
2018-10-08 22:26:59 +02:00
Martin Kroeker
a9b51b8448 Merge pull request #1798 from martin-frbg/cmake-avx512
Add -march=skylake-avx512 when required
2018-10-08 21:15:17 +02:00
Martin Kroeker
eba394c711 Add -march=skylake-avx512 when required
fixes #1797
2018-10-08 19:18:12 +02:00
Arjan van de Ven
582c589727 dgemm/skylakex: replace discrete mul/add with fma
very minor gains since it's not super hot code, but general principles
2018-10-06 23:13:26 +00:00
Arjan van de Ven
adbf6afa25 Add vector optimizations for ncopy as well for dgemm/skylakex 2018-10-06 21:18:12 +00:00
Arjan van de Ven
32bec8afbb add a skylakex optimized dgemm beta function 2018-10-06 16:36:26 +00:00
Martin Kroeker
6e2c494556 Merge pull request #1791 from dev-zero/develop
fix parallel build issues with APFS/HFS+/ext2/3 in netlib-lapack
2018-10-06 16:29:29 +02:00
Arjan van de Ven
20c5d668fe dgemm/avx512 simplify and speed up the 4x4 kernel 2018-10-06 14:12:32 +00:00
Arjan van de Ven
6d43c51ccf undo slow dgemm/skylake microoptimization
the compare is more costly than the work
2018-10-06 14:00:37 +00:00
Arjan van de Ven
d74dc39b0f Add optimized *copy versions for skylakex
Add optimized n/t copy versions for skylakex; in the patch the
tcopy is also rewritten using intrinsics; the ncopy file
will be worked on in a future commit
2018-10-06 13:51:44 +00:00
Martin Kroeker
41951da6d4 Merge pull request #6 from xianyi/develop
merge develop
2018-10-06 14:36:36 +02:00
Martin Kroeker
474f7e9583 Add SYMBOLPREFIX and -SUFFIX options and improve help output 2018-10-06 14:28:04 +02:00
Tiziano Müller
79ea839b63 fix parallel build issues with APFS/HFS+/ext2/3 in netlib-lapack
The problem is that OpenBLAS sets the LAPACKE_LIB and the TMGLIB to the
same object and uses the `ar` feature to update the archive file. If the
underlying filesystem does not have sub-second timestamp resolution and
the system is fast enough (or `ccache` is used), the timestamp of the
builds which should be added to the previously generated archive is the
same as the archive file itself and therefore `make` does not update the
archive.

Since OpenBLAS takes care to not run the different targets updating the
archive in parallel, the easiest solution is to declare the respective
targets `.PHONY`, forcing `make` to always update them.

fixes #1682
2018-10-06 14:10:05 +02:00
Martin Kroeker
f7f97c6148 Merge pull request #1789 from brada4/develop
update travis alpine chroot with avx512 intrinsics headers
2018-10-05 20:42:37 +02:00
Martin Kroeker
6f22e1cfb8 Merge pull request #1788 from fenrus75/avx512-8x16
skylake dgemm: Add a 16x8 kernel
2018-10-05 20:40:38 +02:00
Arjan van de Ven
66b43affbc Add a 24x8 kernel to the skylakex dgemm implementation
Minor gains for small matrixes, but at 512x512 and above the gain
gets more significant.
2018-10-05 13:22:21 +00:00
Arjan van de Ven
1938819c25 skylake dgemm: Add a 16x8 kernel
The next step for the avx512 dgemm code is adding a 16x8 kernel.
In the 8x8 kernel, each FMA has a matching load (the broadcast);
in the 16x8 kernel we can reuse this load for 2 FMAs, which
in turn reduces pressure on the load ports of the CPU and gives
a nice performance boost (in the 25% range).
2018-10-05 13:11:35 +00:00
Andrew
bda3dbe2eb update travis alpine chroot with avx512 intrinsics headers 2018-10-05 15:47:55 +03:00
Andrew
c3e0f0eb38 update travis alpine chroot with avx512 intrinsics headers 2018-10-05 15:41:52 +03:00
Martin Kroeker
a980953bd7 Merge pull request #1785 from brada4/develop
address #1782 2nd loop
2018-10-05 08:25:38 +02:00
Martin Kroeker
78c99d5231 Merge pull request #1784 from fenrus75/dgemm-avx512
Create a AVX512 enabled version of DGEMM
2018-10-05 08:03:27 +02:00
Martin Kroeker
b7496c3638 Function name needs to be CNAME, set from outside to allow suffixing for dynamic_arch 2018-10-04 19:14:59 +02:00
Martin Kroeker
95f4e87579 Merge pull request #1787 from jeromerobert/develop
Fix unknown type name __WAIT_STATUS on RHEL5
2018-10-04 18:41:47 +02:00
Jerome Robert
b095f2fad6 Fix unknown type name __WAIT_STATUS on RHEL5
With glibc 2.5 one must have #define _XOPEN_SOURCE >= 500 to use wait.
But reading glibc code this is actually needed only if stdlib.h was
included before sys/wait.h. This was the case here through
openblas_utest.h. So changing include fix compilation on RHEL5 and
should ne hurt with more recent distro.

* Problem found when using with gcc 5.5 and 4.7.2 on RHEL5/CENTOS5
* Fix #1519
2018-10-04 14:37:08 +02:00
Martin Kroeker
02ef20a1e4 Merge pull request #1786 from martin-frbg/immintrin
Check for Immintrin.h presence in the AVX512 compatibility test as well
2018-10-04 09:07:09 +02:00
Martin Kroeker
4c3643ed7f Check availability of immintrin.h in the AVX512 compatibility test 2018-10-04 07:36:49 +02:00
Martin Kroeker
591cca7cb0 Check availability of immintrin.h in the AVX512 compatibility test 2018-10-04 07:35:30 +02:00
Andrew
3439158dea address #1782 2nd loop 2018-10-03 21:20:50 +02:00
Arjan van de Ven
45fe8cb0c5 Create a AVX512 enabled version of DGEMM
This patch adds dgemm_kernel_4x8_skylakex.c which is
* dgemm_kernel_4x8_haswell.s converted to C + intrinsics
* 8x8 support added
* 8x8 kernel implemented using AVX512

Performance is a work in progress, but already shows a 10% - 20%
increase for a wide range of matrix sizes.
2018-10-03 14:45:25 +00:00
Martin Kroeker
544b069e85 Merge pull request #1780 from martin-frbg/issue1774-2
Convert fldmia/fstmia instructions to UAL syntax for clang7
2018-09-29 09:27:47 +02:00
Martin Kroeker
9b2a7ad40d Convert fldmia/fstmia instructions to UAL syntax for clang7
second part of fix for #1774, containing files missed in #1775
2018-09-28 23:05:15 +02:00
Martin Kroeker
10ce70701a Merge pull request #1778 from fengrl/develop
test_axpy work error on LOONGSON3A platform #1777
2018-09-26 11:14:58 +02:00
fengruilin
6fc85a6359 test_axpy work error on LOONGSON3A platform #1777 2018-09-26 15:14:04 +08:00
Martin Kroeker
831c661386 Merge pull request #1775 from martin-frbg/issue1774
Convert fldmia/fstmia instructions to UAL syntax for clang7
2018-09-25 18:58:39 +02:00
Martin Kroeker
7e5df34e6a Convert fldmia/fstmia instructions to UAL syntax for clang7
fixes #1774
2018-09-25 09:41:58 +02:00
Martin Kroeker
4f45040b89 Merge pull request #1773 from martin-frbg/issue1767
Include thread numbers in failure message from blas_thread_init
2018-09-23 23:25:15 +02:00
Martin Kroeker
28aa94bf4b Include thread numbers in failure message from blas_thread_init
to aid in debugging cases like #1767
2018-09-22 14:00:15 +02:00
Martin Kroeker
56e7c68810 Merge pull request #1771 from staticfloat/sf/ldflags
Add `$(LDFLAGS)` to `$(CC)` and `$(FC)` invocations within `exports/Makefile`
2018-09-22 13:11:39 +02:00
Martin Kroeker
cf6df9464c Document the stub status of the QUAD_PRECiSION code (#1772)
* Document the stub status of the QUAD_PRECiSION code inherited from GotoBLAS2

in response to #1769
2018-09-22 12:31:37 +02:00
Elliot Saba
6f77af2eef Add $(LDFLAGS) to $(CC) and $(FC) invocations within exports/Makefile 2018-09-21 09:19:51 +00:00
Martin Kroeker
4d183e5567 Merge pull request #1765 from martin-frbg/issue1761
Do not use the new TLS-enabled memory allocator for non-threaded builds, and disable TLS by default in gmake as well
2018-09-19 22:02:21 +02:00
Martin Kroeker
34d55fd165 Merge pull request #1764 from yurivict/64-suffix
Allow to install the 'interface64' version concurrently with the regular version
2018-09-19 18:16:38 +02:00
Martin Kroeker
b991570210 Merge pull request #1762 from martin-frbg/issue1710-2
Add explicit casts to silence compiler warnings
2018-09-19 18:16:21 +02:00
Martin Kroeker
288aeea8a2 Fix default settings - USE_TLS and USE_SIMPLE_THREADED_LEVEL3 should both be off 2018-09-19 18:08:31 +02:00
Martin Kroeker
1ad1e79062 Catch inadvertent USE_TLS=0 declaration
for #1766
2018-09-19 18:03:43 +02:00
Martin Kroeker
b402626509 Do not use the new TLS code for non-threaded builds even if USE_TLS is set
Workaround for #1761 as that exposed a problem in the new code (which was intended to speed up multithreaded code only anyway).
2018-09-16 12:43:36 +02:00
Martin Kroeker
ec0cac1669 Merge pull request #4 from xianyi/develop
Update branch
2018-09-16 12:36:49 +02:00
Yuri
2349e15149 Allow to install the 'interfare64' version concurrently with the regular version 2018-09-15 21:00:03 -07:00
Martin Kroeker
f3c262156e Add an explicit cast to silence a warning
for #1710
2018-09-13 14:24:29 +02:00
Martin Kroeker
30f5a69ab8 Add explicit cast to silence a warning
for #1710
2018-09-13 14:23:31 +02:00
Martin Kroeker
fd081a91e4 Merge pull request #1759 from martin-frbg/lapack283
Remove an unused variable from several LAPACKE 2stage_work functions
2018-09-11 13:52:09 +02:00
Martin Kroeker
094f8c3b57 remove unused variable ldb_t
Copied from Reference-LAPACK PR283
2018-09-11 10:53:47 +02:00
Martin Kroeker
5cf090f516 remove unused variable ldb_t
Copied from Reference-LAPACK PR283
2018-09-11 10:52:30 +02:00
Martin Kroeker
58363542e7 remove unused variable ldb_t
Copied from Reference-LAPACK PR283
2018-09-11 10:51:17 +02:00
Martin Kroeker
3abc22a5bf Merge pull request #1757 from brada4/develop
fix small typo in strmm_ LN
2018-09-09 22:55:15 +02:00
Andrew
1e531701b7 fix small typo 2018-09-09 16:52:25 +02:00
Martin Kroeker
5d42b6ea04 Merge pull request #1756 from martin-frbg/issue1754
Follow netlib renaming/aliasing CBLAS_ORDER to CBLAS_LAYOUT
2018-09-07 11:02:18 +02:00
Martin Kroeker
ba4f433321 Merge pull request #1749 from martin-frbg/issue1531
Fix ARMV8 cross-compilation for IOS
2018-09-07 11:02:01 +02:00
Martin Kroeker
4cf7315a5d Adjust ARMV8 SGEMM unrolling when using the C fallback kernel_2x2 for IOS 2018-09-06 21:41:54 +02:00
Martin Kroeker
b57af93792 just make CBLAS_LAYOUT an alias of the existing CBLAS_ORDER
to avoid having to change all instances of enum CBLAS_ORDER in this file
2018-09-06 16:54:31 +02:00
Martin Kroeker
8aeab0601e Follow netlib renaming/aliasing CBLAS_ORDER to CBLAS_LAYOUT
fixes #1754
2018-09-06 16:39:52 +02:00
Martin Kroeker
1cb7b9015e Conditional compilation of assembly files that IOS does not like 2018-09-04 11:06:51 +02:00
Martin Kroeker
a4bd41e9f2 Fix paths to C kernels for nrm2 2018-09-04 10:51:19 +02:00
Martin Kroeker
9e2bb0c641 Update with the changes from 0.3.3 2018-08-31 00:21:13 +02:00
Martin Kroeker
dbfd7524cd Update version to 0.3.4.dev 2018-08-31 00:19:21 +02:00
Martin Kroeker
2982ce505d Update version to 0.3.4.dev 2018-08-31 00:18:37 +02:00
maamountki
e6c0e39492 Optimize Zgemv 2018-08-13 12:23:40 +03:00
maamountki
453bfa7e71 [ZARCH] Restore detect() function 2018-08-06 20:03:49 +03:00
maamountki
23229011db [ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization 2018-08-06 18:20:40 +03:00
621 changed files with 66594 additions and 11884 deletions

143
.drone.yml Normal file
View File

@@ -0,0 +1,143 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest

View File

@@ -4,11 +4,10 @@ dist: precise
sudo: true
language: c
jobs:
matrix:
include:
- &test-ubuntu
os: linux
stage: test
compiler: gcc
addons:
apt:
@@ -26,6 +25,15 @@ jobs:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
@@ -59,7 +67,6 @@ jobs:
- BTYPE="BINARY=32"
- os: linux
stage: test
compiler: gcc
addons:
apt:
@@ -80,13 +87,12 @@ jobs:
# that don't require sudo.
- &test-alpine
os: linux
stage: test
dist: trusty
sudo: true
language: minimal
before_install:
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
install:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
@@ -120,11 +126,10 @@ jobs:
- <<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
- &test-cmake
os: linux
stage: test
compiler: clang
addons:
apt:
@@ -153,8 +158,7 @@ jobs:
- &test-macos
os: osx
stage: test
osx_image: xcode8
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
@@ -165,6 +169,7 @@ jobs:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
osx_image: xcode8.3
env:
- BTYPE="BINARY=32"

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 3)
set(OpenBLAS_PATCH_VERSION 8.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@@ -15,16 +15,26 @@ include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(OpenBLAS_LIBNAME openblas)
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
@@ -37,12 +47,27 @@ endif()
#######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
if (NOT DYNAMIC_ARCH)
@@ -55,10 +80,10 @@ endif ()
set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src)
endif()
list(APPEND SUBDIRS lapack)
endif ()
# set which float types we want to build for
@@ -127,7 +152,7 @@ endif ()
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@@ -142,15 +167,9 @@ if (${DYNAMIC_ARCH})
endforeach()
endif ()
# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
# Android needs to explicitly link against libm
if(ANDROID)
@@ -159,7 +178,7 @@ endif()
# Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
# Creates verbose .def file (51KB vs 18KB)
@@ -192,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
if (MSVC OR NOT NOFORTRAN)
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
endif()
@@ -210,15 +230,92 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
set(ARCH_IN ${ARCH})
endif()
if (${CORE} STREQUAL "generic")
set(ARCH_IN "GENERIC")
endif ()
if (NOT DEFINED EXPRECISION)
set(EXPRECISION_IN 0)
else()
set(EXPRECISION_IN ${EXPRECISION})
endif()
if (NOT DEFINED NO_CBLAS)
set(NO_CBLAS_IN 0)
else()
set(NO_CBLAS_IN ${NO_CBLAS})
endif()
if (NOT DEFINED NO_LAPACK)
set(NO_LAPACK_IN 0)
else()
set(NO_LAPACK_IN ${NO_LAPACK})
endif()
if (NOT DEFINED NO_LAPACKE)
set(NO_LAPACKE_IN 0)
else()
set(NO_LAPACKE_IN ${NO_LAPACKE})
endif()
if (NOT DEFINED NEED2UNDERSCORES)
set(NEED2UNDERSCORES_IN 0)
else()
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
endif()
if (NOT DEFINED ONLY_CBLAS)
set(ONLY_CBLAS_IN 0)
else()
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
endif()
if (NOT DEFINED BU)
set(BU _)
endif()
if (NOT ${SYMBOLPREFIX} STREQUAL "")
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
endif()
# Install project
# Install libraries
install(TARGETS ${OpenBLAS_LIBNAME}
EXPORT "OpenBLASTargets"
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
# Install headers
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
@@ -238,7 +335,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@@ -251,10 +348,11 @@ endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT NO_LAPACKE)
@@ -266,29 +364,31 @@ if(NOT NO_LAPACKE)
ADD_CUSTOM_TARGET(genlapacke
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
endif()
include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif()
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}")
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
VERSION ${${PN}_VERSION}
COMPATIBILITY AnyNewerVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake
${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}Targets"
NAMESPACE "${PN}::"
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@@ -167,4 +167,7 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel

View File

@@ -1,4 +1,229 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.7
11-Aug 2019
common:
* having the gmake special variables TARGET_ARCH or TARGET_MACH
defined no longer causes build failures in ctest or utest
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
has the same effect as setting them to 1
* a new test program was added to allow checking the library for
thread safety
* a new option USE_LOCKING was added to ensure thread safety when
OpenBLAS itself is built without multithreading but will be
called from multiple threads.
* a build failure on Linux with glibc versions earlier than 2.5
was fixed
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
on glibc 2.6 was fixed
* NO_AFFINITY was added to the CMAKE options (and defaults to being
active on Linux, as in the gmake builds)
x86_64:
* the build-time logic for detection of AVX512 availability in
the processor and compiler was fixed
* gmake builds on OSX now set the internal name of the library to
libopenblas.0.dylib (consistent with CMAKE)
* the Haswell DGEMM kernel received a significant speedup through
improved prefetch and load instructions
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
increased by avoiding vpermpd instructions
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
to fix remaining errors in DGEMM, DSYMM and DTRMM
## POWER:
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
* added optimized kernels for POWER9 SGEMM and STRMM
## ARMV7:
* fixed the softfp implementations of xAMAX and IxAMAX
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
they were appropriate for only a subset of platforms
====================================================================
Version 0.3.6
29-Apr-2019
common:
* the build tools now check that a given cpu TARGET is actually valid
* the build-time check of system features (c_check) has been made
less dependent on particular perl features (this should mainly
benefit building on Windows)
* several problem with the ReLAPACK integration were fixed,
including INTERFACE64 support and building a shared library
* building with CMAKE on BSD systems was improved
* a non-absolute SUM function was added based on the
existing optimized code for ASUM
* CBLAS interfaces to the IxMIN and IxMAX functions were added
* a name clash between LAPACKE and BOOST headers was resolved
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
kernels
* a crash on thread (key) deletion with the USE_TLS=1 memory management
option was fixed
* restored several earlier fixes, in particular for OpenMP performance,
building on BSD, and calling fork on CYGWIN, which had inadvertently
been dropped in the 0.3.3 rewrite of the memory management code.
x86_64:
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
* building with old versions of MSVC was fixed
* it is now possible to build a static library on Windows with CMAKE
* accessing environment variables on CYGWIN at run time was fixed
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
with CMAKE as well
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
* assembly bugs involving undeclared modification of input operands were fixed
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
* a similar bug was fixed in the blas_quickdivide code used to split workloads
in most functions
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
environment does not support AVX512
* improved GEMM performance on ZEN targets
x86:
* build failures caused by the recently added checks for AVX512 were fixed
* an inline assembly bug involving undeclared modification of an input argument was
fixed in the blas_quickdivide code used to split workloads in most functions
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
MIPS32:
* a bug in the IMIN implementation made it return the result of IMAX
POWER:
* single precision BLAS1/2 functions have received optimized POWER8 kernels
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
* building on PPC970 systems under OSX Leopard or Tiger is now supported
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
* building a shared library on AIX is now supported for POWER6
* DYNAMIC_ARCH support has been added for POWER6 and newer
ARMv7:
* corrected xDOT behaviour with zero INC_X or INC_Y
* a bug in the IMIN implementation made it return the result of IMAX
ARMv8:
* added support for HiSilicon TSV110 cpus
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* cross-compilation with CMAKE now works again
* a bug in the IMIN implementation made it return the result of IMAX
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
IBM Z:
* optimized microkernels for single precicion BLAS1/2 functions have been added
for both Z13 and Z14
====================================================================
Version 0.3.5
31-Dec-2018
common:
* loop unrolling in TRMV has been enabled again.
* A domain error in the thread workload distribution for SYRK
has been fixed.
* gmake builds will now automatically add -fPIC to the build
options if the platform requires it.
* a pthreads key leakage (and associate crash on dlclose) in
the USE_TLS codepath was fixed.
* building of the utest cases on systems that do not provide
an implementation of complex.h was fixed.
x86_64:
* the SkylakeX code was changed to compile on OSX.
* unwanted application of the -march=skylake-avx512 option
to the common code parts of a DYNAMIC_ARCH build was fixed.
* improved performance of SGEMM for small workloads on Skylake X.
* performance of SGEMM and DGEMM was improved on Haswell.
ARMV8:
* a configuration error that broke the CNRM2 kernel was corrected.
* compilation of the GEMM kernels with CMAKE was fixed.
* DYNAMIC_ARCH builds are now available with CMAKE as well.
* using CMAKE for cross-compilation to the new cpu TARGETs
introduced in 0.3.4 now works.
POWER:
* a problem in cpu autodetection for AIX has been corrected.
====================================================================
Version 0.3.4
02-Dec-2018
common:
* the new, experimental thread-local memory allocation had
inadvertently been left enabled for gmake builds in 0.3.3
despite the announcement. It is now disabled by default, and
single-threaded builds will keep using the old allocator even
if the USE_TLS option is turned on.
* OpenBLAS will now provide enough buffer space for at least 50
threads by default.
* The output of openblas_get_config() now contains the version
number.
* A serious thread safety bug in GEMV operation with small M and
large N size has been fixed.
* The code will now automatically call blas_thread_init after a
fork if needed before handling a call to openblas_set_num_threads
* Accesses to parallelized level3 functions from multiple callers
are now serialized to avoid thread races (unless using OpenMP).
This should provide better performance than the known-threadsafe
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
* When building LAPACK with gfortran, -frecursive is now (again)
enabled by default to ensure correct behaviour.
* The OpenBLAS version cblas.h now supports both CBLAS_ORDER and
CBLAS_LAYOUT as the name of the matrix row/column order option.
* Externally set LDFLAGS are now passed through to the final compile/link
steps to facilitate setting platform-specific linker flags.
* A potential race condition during the build of LAPACK (that would
usually manifest itself as a failure to build TESTING/MATGEN) has been
fixed.
* xHEMV has been changed to stay single-threaded for small input sizes
where the overhead of multithreading exceeds any possible gains
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
ThunderX hardware with sizable input.
* Linker flags for the PGI compiler have been updated
* Behaviour of AXPY with zero increments is now handled in the C interface,
correcting the result on at least Intel Atom.
* The result matrix from calling SGELSS with an all-zero input matrix is
now zeroed completely.
x86_64:
* Autodetection of AMD Ryzen2 has been fixed (again).
* CMAKE builds now support labeling of an INTERFACE64=1 build of
the library with the _64 suffix.
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
has been sped up by rewriting with C intrinsics
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
POWER:
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
* CPU type detection has been implemented for AIX.
* CPU type detection has been fixed for NETBSD.
MIPS64:
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
* DSDOT on LOONGSON3A has been fixed.
* the SGEMM microkernel has been hardened against potential data loss.
ARMV8:
* DYNAMic_ARCH support is now available for 64bit ARM
* cross-compiling for ARMV8 under iOS now works.
* cpu-specific code has been rearranged to make better use of both
hardware commonalities and model-specific compiler optimizations.
* XGENE1 has been removed as a TARGET, superseded by the improved generic
ARMV8 support.
ARMV7:
* Older assembly mnemonics have been converted to UAL form to allow
building with clang 7.0
* Cross compiling LAPACKE for Android has been fixed again (broken by
update to LAPACK 3.7.0 some while ago).
====================================================================
Version 0.3.3
31-Aug-2018

View File

@@ -34,7 +34,7 @@ endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -96,7 +96,7 @@ endif
@echo
shared :
ifndef NO_SHARED
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@@ -109,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@@ -123,15 +124,18 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
$(MAKE) -C utest all
endif
$(MAKE) -C utest all
ifndef NO_CBLAS
$(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif
endif
libs :
ifeq ($(CORE), UNKOWN)
ifeq ($(CORE), UNKNOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
ifeq ($(NOFORTRAN), 1)
@@ -251,7 +255,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc

View File

@@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
CCOMMON_OPT += -mfpu=neon
FCOMMON_OPT += -mfpu=neon
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@@ -9,11 +9,6 @@ endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp
endif

View File

@@ -4,22 +4,42 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
endif
ifeq ($(CORE), VULCAN)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
endif
ifeq ($(CORE), CORTEXA72)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
ifeq ($(CORE), CORTEXA73)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
endif
ifeq ($(CORE), FALKOR)
CCOMMON_OPT += -march=armv8-a -mtune=falkor
FCOMMON_OPT += -march=armv8-a -mtune=falkor
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif

View File

@@ -48,6 +48,7 @@ ifndef NO_CBLAS
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@@ -57,14 +58,14 @@ ifndef NO_LAPACKE
endif
#for install static library
ifndef NO_STATIC
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@@ -81,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@@ -93,6 +96,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif
endif
else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
#Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@@ -109,7 +139,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@@ -9,7 +9,15 @@ else
USE_OPENMP = 1
endif
ifeq ($(CORE), POWER9)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
endif
endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
@@ -21,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif
endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.3
VERSION = 0.3.8.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -48,6 +48,8 @@ VERSION = 0.3.3
# HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# Please note that AVX is not available on 32-bit.
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
# BINARY=64
# About threaded BLAS. It will be automatically detected if you don't
@@ -56,8 +58,14 @@ VERSION = 0.3.3
# For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
# The OpenMP scheduler to use - by default this is "static" and you
@@ -68,36 +76,45 @@ VERSION = 0.3.3
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script.
# You can define the maximum number of threads. Basically it should be less
# than or equal to the number of CPU threads. If you don't specify one, it's
# automatically detected by the build system.
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
# detection includes logical CPUs, thus allowing the use of SMT.
# Users may opt at runtime to use less than NUM_THREADS threads.
#
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
# footprint penalty, even if users reduce the actual number of threads at runtime.
# NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# OpenBLAS's calculation API from multiple threads, please comment this in.
# This flag defines how many instances of OpenBLAS's calculation API can actually
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in.
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1
# if you don't need generate the shared library, please comment it in.
# If you don't need to generate the shared library, please comment this in.
# NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in.
# If you don't need the CBLAS interface, please comment this in.
# NO_CBLAS = 1
# If you only want CBLAS interface without installing Fortran compiler,
# please comment it in.
# If you only want the CBLAS interface without installing a Fortran compiler,
# please comment this in.
# ONLY_CBLAS = 1
# If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# If you don't need LAPACK, please comment this in.
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
# NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0
@@ -106,18 +123,18 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses
# thread-local storage instead of a central memory buffer in memory.c
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
# for this to work.
USE_TLS = 1
# USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
# are not sure(equivalent to "-i8" option).
# compilers support this. It's safe to keep this commented out if you
# are not sure. (This is equivalent to the "-i8" ifort option).
# INTERFACE64 = 1
# Unfortunately most of kernel won't give us high quality buffer.
@@ -125,10 +142,18 @@ USE_TLS = 1
# but it will consume time. If you don't like it, you can disable one.
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
# This feature is only implemented on Linux, and is always disabled on other platforms.
# Enabling affinity handling may improve performance, especially on NUMA systems, but
# it may conflict with certain applications that also try to manage affinity.
# This conflict can result in threads of the application calling OpenBLAS ending up locked
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
# else modifies affinity settings.
# Note: enabling affinity has been known to cause problems with NumPy and R
NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
@@ -138,6 +163,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@@ -152,6 +181,9 @@ NO_AFFINITY = 1
# FUNCTION_PROFILE = 1
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
# This option should not be used - it is a holdover from unfinished code present
# in the original GotoBLAS2 library that may be usable as a starting point but
# is not even expected to compile in its present form.
# QUAD_PRECISION = 1
# Theads are still working for a while after finishing BLAS operation
@@ -159,17 +191,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT
# system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory
# Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
@@ -177,7 +209,7 @@ NO_AFFINITY = 1
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very
# If you need sanity check by comparing results to reference BLAS. It'll be very
# slow (Not implemented yet).
# SANITY_CHECK = 1
@@ -189,8 +221,8 @@ NO_AFFINITY = 1
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2
# gfortran option for LAPACK
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
# gfortran option for LAPACK to improve thread-safety
# It is enabled by default in Makefile.system for gfortran
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive
@@ -217,6 +249,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
#
# End of user configuration
#

View File

@@ -9,6 +9,22 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture.
ifndef ARCH
ARCH := $(shell uname -m)
endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# Default C compiler
@@ -54,6 +70,7 @@ endif
ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
GETARCH_FLAGS += -DUSER_TARGET
endif
# Force fallbacks for 32bit
@@ -83,6 +100,9 @@ endif
ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
endif
@@ -122,7 +142,12 @@ endif
endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(ARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@@ -140,7 +165,8 @@ GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
endif
ifeq ($(NO_AVX2), 1)
@@ -221,6 +247,10 @@ SMP = 1
endif
endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC
NEED_PIC = 1
endif
@@ -237,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS.
# When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
override FEXTRALIB =
endif
#
@@ -372,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
#
# Architecture dependent settings
#
@@ -505,6 +542,19 @@ CCOMMON_OPT += $(XCCOMMON_OPT)
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
endif
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
DYNAMIC_CORE += POWER9
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE
override DYNAMIC_ARCH=
@@ -713,6 +763,10 @@ endif
ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
@@ -1018,10 +1072,12 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifdef USE_TLS
ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif
@@ -1069,8 +1125,12 @@ endif
endif
ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)
override undefine NO_AFFINITY
else
CCOMMON_OPT += -DNO_AFFINITY
endif
endif
ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE
@@ -1132,8 +1192,6 @@ ifndef FCOMMON_OPT
FCOMMON_OPT = -O2 -frecursive
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
@@ -1141,6 +1199,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
ifdef NEED_PIC
ifeq (,$(findstring PIC,$(FFLAGS)))
override FFLAGS += -fPIC
endif
endif
#For LAPACK Fortran codes.
#Disable -fopenmp for LAPACK Fortran codes on Windows.
ifdef OS_WINDOWS
@@ -1199,7 +1263,11 @@ endif
LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

View File

@@ -9,14 +9,36 @@ endif
endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif
ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64

View File

@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
@@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS
@@ -201,7 +204,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.

View File

@@ -48,6 +48,7 @@ POWER5
POWER6
POWER7
POWER8
POWER9
PPCG4
PPC970
PPC970MP
@@ -83,11 +84,16 @@ ARMV5
8.ARM 64-bit CPU:
ARMV8
CORTEXA53
CORTEXA57
VULCAN
CORTEXA72
CORTEXA73
FALKOR
THUNDERX
THUNDERX2T99
TSV110
9.System Z:
ZARCH_GENERIC
Z13
Z14

View File

@@ -35,7 +35,14 @@ environment:
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- COMPILER: MinGW64-gcc-7.2.0
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
@@ -52,10 +59,17 @@ install:
before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
build_script:
- cmake --build .
@@ -64,3 +78,4 @@ test_script:
- echo Running Test
- cd utest
- openblas_utest

51
azure-pipelines.yml Normal file
View File

@@ -0,0 +1,51 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {

View File

@@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,29 +28,21 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,26 +28,13 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
A <- matrix(runif(n * n), nrow = n)
B <- matrix(runif(n * n), nrow = n)
C <- 1
z <- system.time(for (l in 1:loops) {
@@ -54,11 +42,10 @@ while (n <= nto) {
l <- l + 1
})
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,31 +28,22 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), nrow = n)
z <- system.time(for (l in 1:loops) {
solve(A, B)
})
mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

85
c_check
View File

@@ -1,7 +1,7 @@
#!/usr/bin/perl
use File::Basename;
use File::Temp qw(tempfile);
#use File::Basename;
# use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 );
#$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@@ -31,12 +31,25 @@ if ($?) {
$cross_suffix = "";
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
eval "use File::Basename";
if ($@){
warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
}
$compiler = "";
@@ -171,20 +184,26 @@ if ($?) {
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$have_msa = 1;
$tmpf = new File::Temp( UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
unlink("$tmpf.o");
}
$architecture = x86 if ($data =~ /ARCH_X86/);
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "int main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("$tmpf.o");
}
unlink("tmpf.o");
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;

18
cblas.h
View File

@@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
typedef CBLAS_ORDER CBLAS_LAYOUT;
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
@@ -72,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
@@ -87,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@@ -44,6 +44,10 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
endif ()
if (X86)
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
endif ()
@@ -69,11 +73,16 @@ if (DYNAMIC_ARCH)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
endif ()
endif ()

View File

@@ -3,6 +3,11 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
@@ -39,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)
@@ -107,6 +107,12 @@ macro(SetDefaultL1)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
endmacro ()
macro(SetDefaultL2)
@@ -162,4 +168,4 @@ macro(SetDefaultL3)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
endmacro ()
endmacro ()

View File

@@ -1,4 +1,5 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
@@ -6,5 +7,5 @@ Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas
Libs: -L${libdir} -lopenblas${libsuffix}
Cflags: -I${includedir}

View File

@@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
set(EXTRALIB "${EXTRALIB} -lm")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
set(EXTRALIB "${EXTRALIB} -lm")
endif ()

View File

@@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_")
endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
@@ -82,18 +85,28 @@ endif ()
# f_check
if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif ()
# Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would
if (DEFINED TARGET_CORE)
set(TCORE ${TARGET_CORE})
else()
set(TCORE ${CORE})
endif()
# TODO: Set up defines that getarch sets up based on every other target
# Perhaps this should be inside a different file as it grows larger
file(APPEND ${TARGET_CONF_TEMP}
"#define ${CORE}\n"
"#define CHAR_CORENAME \"${CORE}\"\n")
if ("${CORE}" STREQUAL "ARMV7")
"#define ${TCORE}\n"
"#define CHAR_CORENAME \"${TCORE}\"\n")
if ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n"
@@ -108,7 +121,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "ARMV8")
elseif ("${TCORE}" STREQUAL "ARMV8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
@@ -116,18 +129,26 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t32\n")
set(SGEMM_UNROLL_M 4)
"#define L2_ASSOCIATIVE\t32\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "CORTEXA57")
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t2097152\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
@@ -135,15 +156,124 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n")
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t167772164\n"
"#define L2_LINESIZE\t128\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t8\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t8\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define L3_SIZE\t33554432\n"
"#define L3_LINESIZE\t64\n"
"#define L3_ASSOCIATIVE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
endif()
# Or should this actually be NUM_CORES?
@@ -163,6 +293,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
file(APPEND ${TARGET_CONF_TEMP}
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
# Move to where gen_config_h would place it
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
else(NOT CMAKE_CROSSCOMPILING)

View File

@@ -39,13 +39,44 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
set(TARGET "ARMV7")
endif ()
endif ()
if (DEFINED TARGET)
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
endif()
endif()
if (DEFINED TARGET)
message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available
if (X86 OR X86_64)
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
endif ()
endif ()
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -117,10 +148,16 @@ endif ()
if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
@@ -137,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif ()
if (BINARY64)
@@ -162,12 +202,24 @@ if (NEED_PIC)
endif ()
if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif ()
if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif ()
if (NO_LAPACK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
#Disable LAPACK C interface
@@ -257,7 +309,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
@@ -304,6 +356,8 @@ if (MIXED_MEMORY_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})

View File

@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT)
endif ()
if (${HOST_OS} STREQUAL "LINUX")
# check if we're building natively on Android (TERMUX)
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
if(${OPERATING_SYSTEM} MATCHES "Android")
set(HOST_OS ANDROID)
endif()
endif()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
@@ -29,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
set(ARM64 1)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
set(ARM 1)
endif()
endif()
if (X86_64)
@@ -67,8 +85,8 @@ else()
endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()

View File

@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@@ -85,6 +85,8 @@ extern "C" {
#if !defined(_MSC_VER)
#include <unistd.h>
#elif _MSC_VER < 1900
#define snprintf _snprintf
#endif
#include <time.h>
@@ -129,7 +131,7 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#include <math.h>
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h>
#endif
#endif
@@ -183,7 +185,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
#ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
@@ -198,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!"
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK
#endif
@@ -348,6 +350,11 @@ typedef int blasint;
#endif
#endif
#ifdef POWER9
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/*
#ifdef PILEDRIVER
@@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else

View File

@@ -19,6 +19,7 @@
#define CDOTC_K cdotc_k
#define CNRM2_K cnrm2_k
#define CSCAL_K cscal_k
#define CSUM_K csum_k
#define CSWAP_K cswap_k
#define CROT_K csrot_k
@@ -249,6 +250,7 @@
#define CDOTC_K gotoblas -> cdotc_k
#define CNRM2_K gotoblas -> cnrm2_k
#define CSCAL_K gotoblas -> cscal_k
#define CSUM_K gotoblas -> csum_k
#define CSWAP_K gotoblas -> cswap_k
#define CROT_K gotoblas -> csrot_k

View File

@@ -19,6 +19,7 @@
#define DDOTC_K ddot_k
#define DNRM2_K dnrm2_k
#define DSCAL_K dscal_k
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k
@@ -174,6 +175,7 @@
#define DDOTC_K gotoblas -> ddot_k
#define DNRM2_K gotoblas -> dnrm2_k
#define DSCAL_K gotoblas -> dscal_k
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k

View File

@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);

View File

@@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
double zasum_k (BLASLONG, double *, BLASLONG);
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
float ssum_k (BLASLONG, float *, BLASLONG);
double dsum_k (BLASLONG, double *, BLASLONG);
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
float csum_k (BLASLONG, float *, BLASLONG);
double zsum_k (BLASLONG, double *, BLASLONG);
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
float samax_k (BLASLONG, float *, BLASLONG);
double damax_k (BLASLONG, double *, BLASLONG);
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);

View File

@@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
extern "C" {
#endif
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
float * A, BLASLONG strideA,
float * B, BLASLONG strideB,
float * R, BLASLONG strideR);
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,

View File

@@ -66,6 +66,7 @@
#define DOTC_K QDOTC_K
#define NRM2_K QNRM2_K
#define SCAL_K QSCAL_K
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K
@@ -356,6 +357,7 @@
#define DOTC_K DDOTC_K
#define NRM2_K DNRM2_K
#define SCAL_K DSCAL_K
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K
@@ -658,6 +660,7 @@
#define DOTC_K SDOTC_K
#define NRM2_K SNRM2_K
#define SCAL_K SSCAL_K
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K
@@ -962,6 +965,7 @@
#define DOTC_K XDOTC_K
#define NRM2_K XNRM2_K
#define SCAL_K XSCAL_K
#define SUM_K XSUM_K
#define SWAP_K XSWAP_K
#define ROT_K XROT_K
@@ -1363,6 +1367,7 @@
#define DOTC_K ZDOTC_K
#define NRM2_K ZNRM2_K
#define SCAL_K ZSCAL_K
#define SUM_K ZSUM_K
#define SWAP_K ZSWAP_K
#define ROT_K ZROT_K
@@ -1785,6 +1790,7 @@
#define DOTC_K CDOTC_K
#define NRM2_K CNRM2_K
#define SCAL_K CSCAL_K
#define SUM_K CSUM_K
#define SWAP_K CSWAP_K
#define ROT_K CROT_K

View File

@@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
#define RPCC_DEFINED
#ifndef NO_AFFINITY
#define WHEREAMI
//#define WHEREAMI
static inline int WhereAmI(void){
int ret=0;
__asm__ __volatile__(".set push \n"

View File

@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG);
float (*ssum_k) (BLASLONG, float *, BLASLONG);
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG);
double (*dsum_k) (BLASLONG, double *, BLASLONG);
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
double (*zasum_k) (BLASLONG, double *, BLASLONG);
double (*zsum_k) (BLASLONG, double *, BLASLONG);
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);

View File

@@ -39,7 +39,7 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#if defined(POWER8)
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
#else
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
#if defined(POWER8)
#if defined(POWER8) || defined(POWER9)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
@@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define PROLOGUE \
.section .text;\
@@ -598,9 +598,14 @@ REALNAME:;\
#ifndef __64BIT__
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\
.REALNAME:;
.REALNAME:
#define EPILOGUE \
_section_.text:;\
@@ -611,9 +616,14 @@ _section_.text:;\
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\
.REALNAME:;
.REALNAME:
#define EPILOGUE \
_section_.text:;\
@@ -774,7 +784,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1
@@ -802,7 +812,7 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#elif defined(POWER8) || defined(POWER9)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
@@ -819,7 +829,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON
#endif
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8)
#else

View File

@@ -19,6 +19,7 @@
#define QDOTC_K qdot_k
#define QNRM2_K qnrm2_k
#define QSCAL_K qscal_k
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k
@@ -161,6 +162,7 @@
#define QDOTC_K gotoblas -> qdot_k
#define QNRM2_K gotoblas -> qnrm2_k
#define QSCAL_K gotoblas -> qscal_k
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k

View File

@@ -12,6 +12,7 @@
#define ISMAX_K ismax_k
#define ISMIN_K ismin_k
#define SASUM_K sasum_k
#define SSUM_K ssum_k
#define SAXPYU_K saxpy_k
#define SAXPYC_K saxpy_k
#define SCOPY_K scopy_k
@@ -170,6 +171,7 @@
#define ISMAX_K gotoblas -> ismax_k
#define ISMIN_K gotoblas -> ismin_k
#define SASUM_K gotoblas -> sasum_k
#define SSUM_K gotoblas -> ssum_k
#define SAXPYU_K gotoblas -> saxpy_k
#define SAXPYC_K gotoblas -> saxpy_k
#define SCOPY_K gotoblas -> scopy_k

View File

@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
* Choosing a SIZE too small will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \

View File

@@ -19,6 +19,7 @@
#define XDOTC_K xdotc_k
#define XNRM2_K xnrm2_k
#define XSCAL_K xscal_k
#define XSUM_K xsum_k
#define XSWAP_K xswap_k
#define XROT_K xqrot_k
@@ -227,6 +228,7 @@
#define XDOTC_K gotoblas -> xdotc_k
#define XNRM2_K gotoblas -> xnrm2_k
#define XSCAL_K gotoblas -> xscal_k
#define XSUM_K gotoblas -> xsum_k
#define XSWAP_K gotoblas -> xswap_k
#define XROT_K gotoblas -> xqrot_k

View File

@@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
return result;
#endif
@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -129,7 +129,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2];
*edx=cpuinfo[3];
#else
__asm__ __volatile__("cpuid"
__asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
@@ -210,7 +211,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
return result;
}
@@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -19,6 +19,7 @@
#define ZDOTC_K zdotc_k
#define ZNRM2_K znrm2_k
#define ZSCAL_K zscal_k
#define ZSUM_K zsum_k
#define ZSWAP_K zswap_k
#define ZROT_K zdrot_k
@@ -249,6 +250,7 @@
#define ZDOTC_K gotoblas -> zdotc_k
#define ZNRM2_K gotoblas -> znrm2_k
#define ZSCAL_K gotoblas -> zscal_k
#define ZSUM_K gotoblas -> zsum_k
#define ZSWAP_K gotoblas -> zswap_k
#define ZROT_K gotoblas -> zdrot_k

14
cpp_thread_test/Makefile Normal file
View File

@@ -0,0 +1,14 @@
include ../Makefile.rule
all :: dgemv_tester dgemm_tester
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
clean ::
rm -f dgemv_tester dgemm_tester

View File

@@ -0,0 +1,55 @@
inline void pauser(){
/// a portable way to pause a program
std::string dummy;
std::cout << "Press enter to continue...";
std::getline(std::cin, dummy);
}
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for(uint32_t i=0; i<numMat; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
matBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
for(uint32_t j=0; j<numMat; j++){
matBlock[i+j] = matBlock[j];
}
}
}
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
for(uint32_t i=0; i<numVec; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
vecBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
for(uint32_t j=0; j<numVec; j++){
vecBlock[i+j] = vecBlock[j];
}
}
}
std::mt19937_64 InitPRNG(){
std::random_device rd;
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
return PRNG;
}
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
std::cout<<i<<std::endl;
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
}
std::cout<<std::endl;
}
std::cout<<std::endl;
}
}

View File

@@ -0,0 +1,92 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs[0]);
numConcurrentThreads = std::stoul(cliArgs[1]);
numTestRounds = std::stoul(cliArgs[2]);
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMM thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
matBlock[i].resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMM thread safety\n";
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMM round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@@ -0,0 +1,101 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs.at(0));
numConcurrentThreads = std::stoul(cliArgs.at(1));
numTestRounds = std::stoul(cliArgs.at(2));
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMV thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
matBlock.at(i).resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Allocating vectors..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
vecBlock.at(i).resize(randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
std::cout<<"done\n";
std::cout<<"Filling vectors with random numbers..."<<std::flush;
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMV round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@@ -53,6 +53,7 @@
#define VENDOR_SIS 8
#define VENDOR_TRANSMETA 9
#define VENDOR_NSC 10
#define VENDOR_HYGON 11
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -116,6 +117,7 @@
#define CORE_EXCAVATOR 26
#define CORE_ZEN 27
#define CORE_SKYLAKEX 28
#define CORE_DHYANA 29
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -139,6 +141,7 @@
#define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20)
#define HAVE_AVX512VL (1 << 21)
#define HAVE_AVX2 (1 << 22)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@@ -214,5 +217,8 @@ typedef struct {
#define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51
#define CPUTYPE_SKYLAKEX 52
#define CPUTYPE_DHYANA 53
#define CPUTYPE_HYGON_UNKNOWN 54
#endif

View File

@@ -34,7 +34,7 @@
#define CPU_CORTEXA15 4
static char *cpuname[] = {
"UNKOWN",
"UNKNOWN",
"ARMV6",
"ARMV7",
"CORTEXA9",

View File

@@ -29,27 +29,43 @@
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
#define CPU_CORTEXA57 2
#define CPU_VULCAN 3
#define CPU_THUNDERX 4
#define CPU_THUNDERX2T99 5
// Arm
#define CPU_CORTEXA53 2
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
// Qualcomm
#define CPU_FALKOR 6
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9
static char *cpuname[] = {
"UNKNOWN",
"ARMV8" ,
"CORTEXA53",
"CORTEXA57",
"VULCAN",
"CORTEXA72",
"CORTEXA73",
"FALKOR",
"THUNDERX",
"THUNDERX2T99"
"THUNDERX2T99",
"TSV110"
};
static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"armv8",
"cortexa53",
"cortexa57",
"vulcan",
"cortexa72",
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99"
"thunderx2t99",
"tsv110"
};
int get_feature(char *search)
@@ -78,7 +94,7 @@ int get_feature(char *search)
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
if (!strcmp(t, search)) { return(1); }
}
@@ -114,15 +130,28 @@ int detect(void)
fclose(infile);
if(cpu_part != NULL && cpu_implementer != NULL) {
if (strstr(cpu_implementer, "0x41") &&
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") ))
return CPU_CORTEXA57; //or compatible A53, A72
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
// Arm
if (strstr(cpu_implementer, "0x41")) {
if (strstr(cpu_part, "0xd03"))
return CPU_CORTEXA53;
else if (strstr(cpu_part, "0xd07"))
return CPU_CORTEXA57;
else if (strstr(cpu_part, "0xd08"))
return CPU_CORTEXA72;
else if (strstr(cpu_part, "0xd09"))
return CPU_CORTEXA73;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
return CPU_FALKOR;
// Cavium
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
return CPU_THUNDERX;
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
}
p = (char *) NULL ;
@@ -180,64 +209,63 @@ void get_subdirname(void)
void get_cpuconfig(void)
{
// All arches should define ARMv8
printf("#define ARMV8\n");
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
int d = detect();
switch (d)
{
case CPU_CORTEXA53:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8:
printf("#define ARMV8\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_VULCAN:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
// Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
printf("#define HAVE_NEON\n");
printf("#define HAVE_VFPV4\n");
case CPU_CORTEXA72:
case CPU_CORTEXA73:
// Common minimum settings for these Arm cores
// Can change a lot, but we need to be conservative
// TODO: detect info from /sys if possible
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
@@ -249,11 +277,7 @@ void get_cpuconfig(void)
break;
case CPU_THUNDERX2T99:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define THUNDERX2T99 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
@@ -269,6 +293,21 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
}
}
@@ -305,7 +344,7 @@ void get_features(void)
if( p == NULL ) return;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
}

View File

@@ -75,7 +75,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_1004K 2
static char *cpuname[] = {
"UNKOWN",
"UNKNOWN",
"P5600",
"1004K"
};

View File

@@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_I6500 6
static char *cpuname[] = {
"UNKOWN",
"UNKNOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B",

View File

@@ -56,6 +56,7 @@
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8
#define CPUTYPE_POWER9 9
char *cpuname[] = {
"UNKNOWN",
@@ -66,7 +67,8 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
"POWER8",
"POWER9"
};
char *lowercpuname[] = {
@@ -78,7 +80,8 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
"power8"
"power8",
"power9"
};
char *corename[] = {
@@ -90,7 +93,8 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
"POWER8",
"POWER9"
};
int detect(void){
@@ -120,6 +124,7 @@ int detect(void){
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
@@ -127,6 +132,33 @@ int detect(void){
#endif
#ifdef _AIX
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = popen("prtconf|grep 'Processor Type'", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Pro", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
pclose(infile);
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
return CPUTYPE_POWER5;
#endif
@@ -143,12 +175,12 @@ int detect(void){
return CPUTYPE_PPC970;
#endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
int id;
id = __asm __volatile("mfpvr %0" : "=r"(id));
__asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
case 0x4e: // POWER9
return return CPUTYPE_POWER8;
return CPUTYPE_POWER9;
break;
case 0x4d:
case 0x4b: // POWER8/8E

View File

@@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
("mov %%ebx, %%edi;"
"cpuid;"
"xchgl %%ebx, %%edi;"
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
#else
__asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
#endif
}
@@ -211,6 +211,44 @@ int support_avx(){
#endif
}
int support_avx2(){
#ifndef NO_AVX2
int eax, ebx, ecx=0, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0)
ret=1; //OS supports AVX2
return ret;
#else
return 0;
#endif
}
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & 32) != 32){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL
}
return ret;
#else
return 0;
#endif
}
int get_vendor(void){
int eax, ebx, ecx, edx;
@@ -233,6 +271,7 @@ int get_vendor(void){
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@@ -294,6 +333,8 @@ int get_cputype(int gettype){
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX;
if (support_avx2()) feature |= HAVE_AVX2;
if (support_avx512()) feature |= HAVE_AVX512VL;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif
@@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
}
}
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) {
if ((get_vendor() == VENDOR_AMD) ||
(get_vendor() == VENDOR_HYGON) ||
(get_vendor() == VENDOR_CENTAUR)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
LDTB.size = 4096;
@@ -1168,7 +1211,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2;
}
break;
case 1:
case 1: // family 6 exmodel 1
switch (model) {
case 6:
return CPUTYPE_CORE2;
@@ -1185,7 +1228,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON;
}
break;
case 2:
case 2: // family 6 exmodel 2
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
@@ -1214,7 +1257,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 3:
case 3: // family 6 exmodel 3
switch (model) {
case 7:
// Bay Trail
@@ -1228,57 +1271,47 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 12:
case 15:
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 13:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
break;
case 4:
case 4: // family 6 exmodel 4
switch (model) {
case 5:
case 6:
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 7:
case 15:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 14:
//Skylake
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 12:
@@ -1288,84 +1321,85 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 5:
case 5: // family 6 exmodel 5
switch (model) {
case 6:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 5:
// Skylake X
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
#endif
case 14:
// Skylake
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 7:
// Xeon Phi Knights Landing
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM;
}
break;
case 6:
case 6: // family 6 exmodel 6
switch (model) {
case 6: // Cannon Lake
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
#endif
}
break;
case 9:
case 8:
break;
case 7: // family 6 exmodel 7
switch (model) {
case 14: // Kaby Lake
if(support_avx())
#ifndef NO_AVX2
case 10: // Goldmont Plus
return CPUTYPE_NEHALEM;
case 14: // Ice Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
@@ -1469,6 +1503,26 @@ int get_cpuname(void){
return CPUTYPE_AMD_UNKNOWN;
}
if (vendor == VENDOR_HYGON){
switch (family) {
case 0xf:
switch (exfamily) {
case 9:
//Hygon Dhyana
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
#else
return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CPUTYPE_BARCELONA;
}
break;
}
return CPUTYPE_HYGON_UNKNOWN;
}
if (vendor == VENDOR_CYRIX){
switch (family) {
case 0x4:
@@ -1590,7 +1644,8 @@ static char *cpuname[] = {
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
"SKYLAKEX"
"SKYLAKEX",
"DHYANA"
};
static char *lowercpuname[] = {
@@ -1645,11 +1700,12 @@ static char *lowercpuname[] = {
"steamroller",
"excavator",
"zen",
"skylakex"
"skylakex",
"dhyana"
};
static char *corename[] = {
"UNKOWN",
"UNKNOWN",
"80486",
"P5",
"P6",
@@ -1677,7 +1733,8 @@ static char *corename[] = {
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
"SKYLAKEX"
"SKYLAKEX",
"DHYANA"
};
static char *corename_lower[] = {
@@ -1709,7 +1766,8 @@ static char *corename_lower[] = {
"steamroller",
"excavator",
"zen",
"skylakex"
"skylakex",
"dhyana"
};
@@ -2009,6 +2067,8 @@ int get_coretype(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// Ryzen 2
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
@@ -2024,6 +2084,23 @@ int get_coretype(void){
}
}
if (vendor == VENDOR_HYGON){
if (family == 0xf){
if (exfamily == 9) {
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
#else
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CORE_BARCELONA;
} else {
return CORE_BARCELONA;
}
}
}
if (vendor == VENDOR_CENTAUR) {
switch (family) {
case 0x6:
@@ -2110,6 +2187,8 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
@@ -2178,6 +2257,8 @@ void get_sse(void){
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");

View File

@@ -27,9 +27,9 @@
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = {
"ZARCH_GENERIC",
@@ -64,10 +64,8 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
return CPU_GENERIC;
}
@@ -116,7 +114,14 @@ void get_cpuconfig(void)
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

View File

@@ -113,7 +113,7 @@ ARCH_X86
ARCH_X86_64
#endif
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
ARCH_POWER
#endif

View File

@@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
override TARGET_ARCH=
override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME)

View File

@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -62,9 +62,36 @@
#endif
#endif
#ifndef TRANSA
#ifndef thread_local
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
# define thread_local _Thread_local
# elif defined _WIN32 && ( \
defined _MSC_VER || \
defined __ICL || \
defined __DMC__ || \
defined __BORLANDC__ )
# define thread_local __declspec(thread)
/* note that ICC (linux) and Clang are covered by __GNUC__ */
# elif defined __GNUC__ || \
defined __SUNPRO_C || \
defined __xlC__
# define thread_local __thread
# else
# define UNSAFE
#endif
#endif
#if defined USE_OPENMP
#undef UNSAFE
#endif
#if !defined(TRANSA) && !defined(UNSAFE)
#define Y_DUMMY_NUM 1024
#if defined(USE_OPENMP)
static FLOAT y_dummy[Y_DUMMY_NUM];
#pragma omp threadprivate(y_dummy)
# else
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
# endif
#endif
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
@@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifdef TRANSA
y += n_from * incy * COMPSIZE;
#else
# ifndef UNSAFE
//for split matrix row (n) direction and vector x of gemv_n
x += n_from * incx * COMPSIZE;
//store partial result for every thread
y += (m_to - m_from) * 1 * COMPSIZE * pos;
# endif
#endif
}
@@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
BLASLONG width, i, num_cpu;
#ifndef TRANSA
#if !defined(TRANSA) && !defined(UNSAFE)
int split_x=0;
#endif
@@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
i -= width;
}
#ifndef TRANSA
#if !defined(TRANSA) && !defined(UNSAFE)
//try to split matrix on row direction and x.
//Then, reduction.
if (num_cpu < nthreads) {
@@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
exec_blas(num_cpu, queue);
}
#ifndef TRANSA
#if !defined(TRANSA) && !defined(UNSAFE)
if(split_x==1){
//reduction
for(i=0; i<num_cpu; i++){

View File

@@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
COPY_K(m, b, incb, buffer, 1);
}
/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */
/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */
for (is = 0; is < m; is += DTB_ENTRIES){
for (is = 0; is < m; is += DTB_ENTRIES * 100){
min_i = MIN(m - is, DTB_ENTRIES * 100);
min_i = MIN(m - is, DTB_ENTRIES);
#ifndef TRANSA
if (is > 0){
fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n");
if (is > 0){
GEMV_N(is, min_i, 0, dp1,
a + is * lda, lda,
B + is, 1,

View File

@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;

View File

@@ -48,6 +48,10 @@
#define SWITCH_RATIO 2
#endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
return 0;
}
static int round_up(int remainder, int width, int multiple)
{
if (multiple > remainder || width <= multiple)
return width;
width = (width + multiple - 1) / multiple;
width = width * multiple;
return width;
}
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) {
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
@@ -554,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
#endif
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
#ifdef USE_ALLOC_HEAP
/* Dynamically allocate workspace */
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
@@ -601,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0;
while (m > 0){
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
width = round_up(m, width, GEMM_PREFERED_SIZE);
m -= width;
if (m < 0) width = width + m;
range_M[num_parts + 1] = range_M[num_parts] + width;
num_parts ++;
}
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);
n -= width;
if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width;
num_parts ++;
}
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
@@ -671,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job);
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0;
}

View File

@@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
BLASLONG width, i;
BLASLONG n_from, n_to;
double dnum, nf, nt, di;
double dnum, nf, nt, di, dinum;
int num_cpu;
int mask = 0;
@@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
dinum = di * di +dnum;
if (dinum <0)
width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1);
else
width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
nf = (double)(arg -> n - n_from);
nt = (double)(arg -> n - n_to);
dnum = (nt * nt - nf * nf) / (double)nthreads;
num_cpu = 0;
range[0] = n_from;
@@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
dinum = di * di + dnum;
if (dinum<0)
width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1);
else
width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
} else {

View File

@@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic.c)
if (ARM64)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()
else ()
list(APPEND COMMON_SOURCES parameter.c)
endif ()

View File

@@ -15,7 +15,15 @@ endif
# COMMONOBJS += info.$(SUFFIX)
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
COMMONOBJS += dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
COMMONOBJS += dynamic_power.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@@ -71,7 +79,15 @@ BLAS_SERVER = blas_server.c
endif
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */
/* Usually it turns off and it's for debugging. */
/* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER];
@@ -582,7 +582,7 @@ int blas_thread_init(void){
if(ret!=0){
struct rlimit rlim;
const char *msg = strerror(ret);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
#ifdef RLIMIT_NPROC
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
@@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
long i;
#ifdef SMP_SERVER
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY

View File

@@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
/* We need this global for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */
@@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}
@@ -478,7 +483,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
void goto_set_num_threads(int num_threads)
{
long i;
long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number;

View File

@@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX;
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -304,9 +305,49 @@ int support_avx(){
#endif
}
int support_avx2(){
#ifndef NO_AVX2
int eax, ebx, ecx=0, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0)
ret=1; //OS supports AVX2
return ret;
#else
return 0;
#endif
}
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 1){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL
}
return ret;
#else
return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
static int get_vendor(void){
@@ -329,6 +370,7 @@ static int get_vendor(void){
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){
}
//Intel Haswell
if (model == 12 || model == 15) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 13) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
@@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){
case 4:
//Intel Haswell
if (model == 5 || model == 6) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 7 || model == 15) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Skylake
if (model == 14) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
@@ -457,72 +514,104 @@ static gotoblas_t *get_coretype(void){
case 5:
//Intel Broadwell
if (model == 6) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 5) {
// Intel Skylake X
#ifndef NO_AVX512
return &gotoblas_SKYLAKEX;
#else
if(support_avx())
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
#endif
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
//Intel Skylake
if (model == 14) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Phi Knights Landing
if (model == 7) {
if(support_avx())
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
else{
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake
if (model == 12) {
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
return &gotoblas_NEHALEM;
}
return NULL;
case 6:
if (model == 6) {
// Cannon Lake
#ifndef NO_AVX512
return &gotoblas_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return &gotoblas_HASWELL;
#else
return &gotoblas_SANDYBRIDGE;
#endif
else
return &gotoblas_NEHALEM;
#endif
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 7:
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake
if(support_avx())
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
@@ -535,7 +624,7 @@ static gotoblas_t *get_coretype(void){
}
}
if (vendor == VENDOR_AMD){
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
if (family <= 0xe) {
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
@@ -615,6 +704,13 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else if (exfamily == 9) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else {
return &gotoblas_BARCELONA;
}

View File

@@ -0,0 +1,198 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#include <asm/hwcap.h>
#include <sys/auxv.h>
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 4
/*
* In case asm/hwcap.h is outdated on the build system, make sure
* that HWCAP_CPUID is defined
*/
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
#define get_cpu_ftr(id, var) ({ \
asm("mrs %0, "#id : "=r" (var)); \
})
static char *corename[] = {
"armv8",
"cortexa57",
"thunderx",
"thunderx2t99",
"unknown"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
return corename[NUM_CORETYPES];
}
static gotoblas_t *force_coretype(char *coretype) {
int i ;
int found = -1;
char message[128];
for ( i=0 ; i < NUM_CORETYPES; i++)
{
if (!strncasecmp(coretype, corename[i], 20))
{
found = i;
break;
}
}
switch (found)
{
case 0: return (&gotoblas_ARMV8);
case 1: return (&gotoblas_CORTEXA57);
case 2: return (&gotoblas_THUNDERX);
case 3: return (&gotoblas_THUNDERX2T99);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
return NULL;
}
static gotoblas_t *get_coretype(void) {
int implementer, variant, part, arch, revision, midr_el1;
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
char coremsg[128];
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
openblas_warning(1, coremsg);
return NULL;
}
get_cpu_ftr(MIDR_EL1, midr_el1);
/*
* MIDR_EL1
*
* 31 24 23 20 19 16 15 4 3 0
* -----------------------------------------------------------------
* | Implementer | Variant | Architecture | Part Number | Revision |
* -----------------------------------------------------------------
*/
implementer = (midr_el1 >> 24) & 0xFF;
part = (midr_el1 >> 4) & 0xFFF;
switch(implementer)
{
case 0x41: // ARM
switch (part)
{
case 0xd07: // Cortex A57
case 0xd08: // Cortex A72
case 0xd03: // Cortex A53
return &gotoblas_CORTEXA57;
}
break;
case 0x42: // Broadcom
switch (part)
{
case 0x516: // Vulcan
return &gotoblas_THUNDERX2T99;
}
break;
case 0x43: // Cavium
switch (part)
{
case 0x0a1: // ThunderX
return &gotoblas_THUNDERX;
case 0x0af: // ThunderX2
return &gotoblas_THUNDERX2T99;
}
break;
}
return NULL;
}
void gotoblas_dynamic_init(void) {
char coremsg[128];
char coren[22];
char *p;
if (gotoblas) return;
p = getenv("OPENBLAS_CORETYPE");
if ( p )
{
gotoblas = force_coretype(p);
}
else
{
gotoblas = get_coretype();
}
if (gotoblas == NULL)
{
snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n");
openblas_warning(1, coremsg);
gotoblas = &gotoblas_ARMV8;
}
if (gotoblas && gotoblas->init) {
strncpy(coren, gotoblas_corename(), 20);
sprintf(coremsg, "Core: %s\n", coren);
openblas_warning(2, coremsg);
gotoblas -> init();
} else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
}
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@@ -0,0 +1,102 @@
#include "common.h"
extern gotoblas_t gotoblas_POWER6;
extern gotoblas_t gotoblas_POWER8;
extern gotoblas_t gotoblas_POWER9;
extern void openblas_warning(int verbose, const char *msg);
static char *corename[] = {
"unknown",
"POWER6",
"POWER8",
"POWER9"
};
#define NUM_CORETYPES 4
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_POWER6) return corename[1];
if (gotoblas == &gotoblas_POWER8) return corename[2];
if (gotoblas == &gotoblas_POWER9) return corename[3];
return corename[0];
}
static gotoblas_t *get_coretype(void) {
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
return &gotoblas_POWER6;
if (__builtin_cpu_is("power8"))
return &gotoblas_POWER8;
if (__builtin_cpu_is("power9"))
return &gotoblas_POWER9;
return NULL;
}
static gotoblas_t *force_coretype(char * coretype) {
int i ;
int found = -1;
char message[128];
for ( i = 0 ; i < NUM_CORETYPES; i++)
{
if (!strncasecmp(coretype, corename[i], 20))
{
found = i;
break;
}
}
switch (found)
{
case 1: return (&gotoblas_POWER6);
case 2: return (&gotoblas_POWER8);
case 3: return (&gotoblas_POWER9);
default: return NULL;
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
}
void gotoblas_dynamic_init(void) {
char coremsg[128];
char coren[22];
char *p;
if (gotoblas) return;
p = getenv("OPENBLAS_CORETYPE");
if ( p )
{
gotoblas = force_coretype(p);
}
else
{
gotoblas = get_coretype();
}
if (gotoblas == NULL)
{
snprintf(coremsg, 128, "Falling back to POWER8 core\n");
openblas_warning(1, coremsg);
gotoblas = &gotoblas_POWER8;
}
if (gotoblas && gotoblas -> init) {
strncpy(coren,gotoblas_corename(),20);
sprintf(coremsg, "Core: %s\n",coren);
openblas_warning(2, coremsg);
gotoblas -> init();
} else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
}
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
/* if number of threads is larger than inital condition */
/* if number of threads is larger than initial condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums;
int ret;
@@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
} else {
@@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
common->num_procs = CPU_COUNT(&cpuset);
}
#endif

View File

@@ -73,8 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(USE_TLS)
#if defined(USE_TLS) && defined(SMP)
#define COMPILE_TLS
#if USE_TLS != 1
#undef COMPILE_TLS
#endif
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2,20)
#undef COMPILE_TLS
@@ -193,45 +198,68 @@ int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
}
@@ -254,6 +282,16 @@ int get_num_procs(void) {
}
#endif
#ifdef OS_AIX
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {
@@ -1275,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
free(map_address);
}
#ifdef SMP
void blas_thread_memory_cleanup(void) {
blas_memory_cleanup((void*)get_memory_table());
}
#endif
void blas_shutdown(void){
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
@@ -1284,7 +1329,7 @@ void blas_shutdown(void){
/* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key)
#endif
blas_memory_cleanup((void*)get_memory_table());
blas_thread_memory_cleanup();
#ifdef SEEK_ADDRESS
base_address = 0UL;
@@ -1471,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) {
blas_shutdown();
#if defined(SMP)
#if defined(OS_WINDOWS)
TlsFree(local_storage_key);
#else
pthread_key_delete(local_storage_key);
#endif
#endif
#ifdef PROFILE
moncontrol (0);
#endif
@@ -1506,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
break;
case DLL_THREAD_DETACH:
#if defined(SMP)
blas_memory_cleanup((void*)get_memory_table());
blas_thread_memory_cleanup();
#endif
break;
case DLL_PROCESS_DETACH:
@@ -1569,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@@ -1576,13 +1630,16 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif
#else
/* USE_TLS / COMPILE_TLS not set */
#include <errno.h>
#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000
@@ -1596,7 +1653,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <stdio.h>
#include <fcntl.h>
#ifndef OS_WINDOWS
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
@@ -1616,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@@ -1655,9 +1712,12 @@ void gotoblas_dummy_for_PGI(void) {
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@@ -1681,45 +1741,70 @@ void goto_set_num_threads(int num_threads) {};
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp);
return nums;
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
}
@@ -1734,6 +1819,22 @@ int get_num_procs(void) {
}
#endif
#ifdef OS_HAIKU
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_AIX
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {
@@ -1754,7 +1855,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
@@ -1831,7 +1932,7 @@ void openblas_fork_handler()
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
@@ -1844,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@@ -1852,11 +1953,11 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
blas_goto_num = 0;
// blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
@@ -1868,7 +1969,7 @@ int blas_get_cpu_number(void){
#endif
blas_omp_num = 0;
// blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
@@ -1876,7 +1977,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -1940,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
static void alloc_mmap_free(struct release_t *release){
if (!release->address) return;
if (munmap(release -> address, BUFFER_SIZE)) {
printf("OpenBLAS : munmap failed\n");
int errsv=errno;
perror("OpenBLAS : munmap failed:");
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
}
}
@@ -1963,11 +2068,21 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} else {
#ifdef DEBUG
int errsv=errno;
perror("OpenBLAS : mmap failed:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
#endif
}
#ifdef OS_LINUX
@@ -2109,14 +2224,18 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
UNLOCK_COMMAND(&alloc_lock);
return map_address;
}
@@ -2484,7 +2603,7 @@ void *blas_memory_alloc(int procpos){
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
int mypos = 0;
#endif
void *map_address;
@@ -2515,6 +2634,11 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
@@ -2550,12 +2674,15 @@ void *blas_memory_alloc(int procpos){
}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG
printf("Alloc Start ...\n");
#endif
#if defined(WHEREAMI) && !defined(USE_OPENMP)
/* #if defined(WHEREAMI) && !defined(USE_OPENMP)
mypos = WhereAmI();
@@ -2564,13 +2691,17 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
/* blas_lock(&memory[position].lock);*/
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
@@ -2578,25 +2709,30 @@ void *blas_memory_alloc(int procpos){
} while (position < NUM_BUFFERS);
#endif
#endif */
position = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
/* if (!memory[position].used) { */
LOCK_COMMAND(&alloc_lock);
/* blas_lock(&memory[position].lock);*/
#if defined(USE_OPENMP)
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
/* } */
#if defined(USE_OPENMP)
blas_unlock(&memory[position].lock);
}
#endif
position ++;
} while (position < NUM_BUFFERS);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;
allocation :
@@ -2606,10 +2742,11 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
do {
#ifdef DEBUG
@@ -2626,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif
@@ -2654,9 +2791,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@@ -2710,8 +2851,9 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
@@ -2725,7 +2867,9 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@@ -2740,8 +2884,9 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
}
@@ -2791,7 +2936,7 @@ void blas_shutdown(void){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
@@ -2816,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
if (hot_alloc != 2) {
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
LOCK_COMMAND(&init_lock);
#endif
@@ -2826,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size -= PAGESIZE;
}
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
UNLOCK_COMMAND(&init_lock);
#endif
@@ -3059,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@@ -3067,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif

View File

@@ -35,15 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#if defined(_WIN32) && defined(_MSC_VER)
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif
#endif
static char* openblas_config_str=""
"OpenBLAS "
VERSION
" "
#ifdef USE64BITINT
"USE64BITINT "
" USE64BITINT "
#endif
#ifdef NO_CBLAS
"NO_CBLAS "

View File

@@ -730,35 +730,8 @@ void blas_set_parameter(void){
#if defined(ARCH_ARM64)
#if defined(VULCAN) || defined(THUNDERX2T99)
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
dgemm_p = 160;
dgemm_q = 128;
dgemm_r = 4096;
sgemm_p = 128;
sgemm_q = 352;
sgemm_r = 4096;
cgemm_p = 128;
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;
#endif
}
#endif

897
dynamic.c Normal file
View File

@@ -0,0 +1,897 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#ifdef _MSC_VER
#define strncasecmp _strnicmp
#define strcasecmp _stricmp
#endif
#ifdef ARCH_X86
#define EXTERN extern
#else
#define EXTERN
#endif
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_PRESCOTT;
#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD;
EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON;
extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
#endif // DYNAMIC_LIST
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
int support_avx(){
#ifndef NO_AVX
int eax, ebx, ecx, edx;
int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
ret=1; //OS support AVX
}
}
return ret;
#else
return 0;
#endif
}
int support_avx2(){
#ifndef NO_AVX2
int eax, ebx, ecx=0, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0)
ret=1; //OS supports AVX2
return ret;
#else
return 0;
#endif
}
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 1){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL
}
return ret;
#else
return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
static int get_vendor(void){
int eax, ebx, ecx, edx;
union
{
char vchar[16];
int vint[4];
} vendor;
cpuid(0, &eax, &ebx, &ecx, &edx);
*(&vendor.vint[0]) = ebx;
*(&vendor.vint[1]) = edx;
*(&vendor.vint[2]) = ecx;
vendor.vchar[12] = '\0';
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
return VENDOR_UNKNOWN;
}
static gotoblas_t *get_coretype(void){
int eax, ebx, ecx, edx;
int family, exfamily, model, vendor, exmodel;
cpuid(1, &eax, &ebx, &ecx, &edx);
family = BITMASK(eax, 8, 0x0f);
exfamily = BITMASK(eax, 20, 0xff);
model = BITMASK(eax, 4, 0x0f);
exmodel = BITMASK(eax, 16, 0x0f);
vendor = get_vendor();
if (vendor == VENDOR_INTEL){
switch (family) {
case 0x6:
switch (exmodel) {
case 0:
if (model <= 0x7) return &gotoblas_KATMAI;
if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
if (model == 14) return &gotoblas_BANIAS;
if (model == 15) return &gotoblas_CORE2;
return NULL;
case 1:
if (model == 6) return &gotoblas_CORE2;
if (model == 7) return &gotoblas_PENRYN;
if (model == 13) return &gotoblas_DUNNINGTON;
if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
if (model == 12) return &gotoblas_ATOM;
return NULL;
case 2:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
//Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX)
//Xeon E7540
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if (model == 10 || model == 13) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 3:
//Intel Sandy Bridge 22nm (Ivy Bridge?)
if (model == 10 || model == 14) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Haswell
if (model == 12 || model == 15) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 13) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 7) return &gotoblas_ATOM; //Bay Trail
return NULL;
case 4:
//Intel Haswell
if (model == 5 || model == 6) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 7 || model == 15) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Skylake
if (model == 14) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Braswell / Avoton
if (model == 12 || model == 13) {
return &gotoblas_NEHALEM;
}
return NULL;
case 5:
//Intel Broadwell
if (model == 6) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 5) {
// Intel Skylake X
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
//Intel Skylake
if (model == 14) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Phi Knights Landing
if (model == 7) {
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
return &gotoblas_NEHALEM;
}
return NULL;
case 6:
if (model == 6) {
// Cannon Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 7:
if (model == 10) // Goldmont plus
return &gotoblas_NEHALEM;
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:
if (model <= 0x2) return &gotoblas_NORTHWOOD;
return &gotoblas_PRESCOTT;
}
}
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
if (family <= 0xe) {
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
if ( (eax & 0xffff) >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
return NULL;
}
else
return NULL;
return &gotoblas_ATHLON;
}
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON;
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
if(model == 1){
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 2 || model == 3){
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 5){
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0 || model == 8){
if (exmodel == 1) {
//AMD Trinity
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if (exmodel == 3) {
//AMD STEAMROLLER
if(support_avx())
return &gotoblas_STEAMROLLER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if (exmodel == 6) {
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}
} else if (exfamily == 8) {
if (model == 1 || model == 8) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else if (exfamily == 9) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else {
return &gotoblas_BARCELONA;
}
}
}
if (vendor == VENDOR_CENTAUR) {
switch (family) {
case 0x6:
return &gotoblas_NANO;
}
}
return NULL;
}
static char *corename[] = {
"Unknown",
"Katmai",
"Coppermine",
"Northwood",
"Prescott",
"Banias",
"Atom",
"Core2",
"Penryn",
"Dunnington",
"Nehalem",
"Athlon",
"Opteron",
"Opteron_SSE3",
"Barcelona",
"Nano",
"Sandybridge",
"Bobcat",
"Bulldozer",
"Piledriver",
"Haswell",
"Steamroller",
"Excavator",
"Zen",
"SkylakeX"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
if (gotoblas == &gotoblas_ATHLON) return corename[11];
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
return corename[0];
}
static gotoblas_t *force_coretype(char *coretype){
int i ;
int found = -1;
char message[128];
//char mname[20];
for ( i=1 ; i <= 24; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
found = i;
break;
}
}
if (found < 0)
{
//strncpy(mname,coretype,20);
snprintf(message, 128, "Core not found: %s\n",coretype);
openblas_warning(1, message);
return(NULL);
}
switch (found)
{
case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);
case 19: return (&gotoblas_PILEDRIVER);
case 18: return (&gotoblas_BULLDOZER);
case 17: return (&gotoblas_BOBCAT);
case 16: return (&gotoblas_SANDYBRIDGE);
case 15: return (&gotoblas_NANO);
case 14: return (&gotoblas_BARCELONA);
case 13: return (&gotoblas_OPTERON);
case 12: return (&gotoblas_OPTERON_SSE3);
case 11: return (&gotoblas_ATHLON);
case 10: return (&gotoblas_NEHALEM);
case 9: return (&gotoblas_DUNNINGTON);
case 8: return (&gotoblas_PENRYN);
case 7: return (&gotoblas_CORE2);
case 6: return (&gotoblas_ATOM);
case 5: return (&gotoblas_BANIAS);
case 4: return (&gotoblas_PRESCOTT);
case 3: return (&gotoblas_NORTHWOOD);
case 2: return (&gotoblas_COPPERMINE);
case 1: return (&gotoblas_KATMAI);
}
return(NULL);
}
void gotoblas_dynamic_init(void) {
char coremsg[128];
char coren[22];
char *p;
if (gotoblas) return;
p = getenv("OPENBLAS_CORETYPE");
if ( p )
{
gotoblas = force_coretype(p);
}
else
{
gotoblas = get_coretype();
}
#ifdef ARCH_X86
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
if (sizeof(void*) == 8) {
if (gotoblas == &gotoblas_KATMAI ||
gotoblas == &gotoblas_COPPERMINE ||
gotoblas == &gotoblas_NORTHWOOD ||
gotoblas == &gotoblas_BANIAS ||
gotoblas == &gotoblas_ATHLON)
gotoblas = &gotoblas_PRESCOTT;
}
#endif
if (gotoblas && gotoblas -> init) {
strncpy(coren,gotoblas_corename(),20);
sprintf(coremsg, "Core: %s\n",coren);
openblas_warning(2, coremsg);
gotoblas -> init();
} else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
}
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
@@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
@@ -141,6 +145,14 @@ else
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
endif
ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
@@ -152,6 +164,7 @@ else
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
endif
rm -f linktest

View File

@@ -40,15 +40,25 @@
void gotoblas_init(void);
void gotoblas_quit(void);
#if defined(SMP) && defined(USE_TLS)
void blas_thread_memory_cleanup(void);
#endif
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
if (reason == DLL_PROCESS_ATTACH) {
gotoblas_init();
}
if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit();
switch(reason) {
case DLL_PROCESS_ATTACH:
gotoblas_init();
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP) && defined(USE_TLS)
blas_thread_memory_cleanup();
#endif
break;
}
return TRUE;

12
f_check
View File

@@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;
@@ -292,9 +292,6 @@ if ($link ne "") {
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= $flags . " ";
}
@@ -311,17 +308,11 @@ if ($link ne "") {
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /^\-rpath-link\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ;
}
@@ -330,7 +321,6 @@ if ($link ne "") {
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
&& ($flags !~ /numa/)
&& ($flags !~ /crt[0-9]/)
&& ($flags !~ /gcc/)
&& ($flags !~ /user32/)

140
getarch.c
View File

@@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#endif
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#define NO_AVX512
#endif
/* #define FORCE_P2 */
/* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */
@@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#endif
#ifdef FORCE_ATOM
#define FORCE
@@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER8"
#endif
#if defined(FORCE_POWER9)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER9"
#define SUBDIRNAME "power"
#define ARCHCONFIG "-DPOWER9 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "power9"
#define CORENAME "POWER9"
#endif
#ifdef FORCE_PPCG4
#define FORCE
@@ -927,11 +958,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DARMV8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "armv8"
#define CORENAME "ARMV8"
#endif
#ifdef FORCE_CORTEXA53
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA53"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA53 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa53"
#define CORENAME "CORTEXA53"
#else
#endif
#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
@@ -942,26 +990,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif
#ifdef FORCE_VULCAN
#ifdef FORCE_CORTEXA72
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VULCAN"
#define SUBARCHITECTURE "CORTEXA72"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVULCAN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
#define ARCHCONFIG "-DCORTEXA72 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "vulcan"
#define CORENAME "VULCAN"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa72"
#define CORENAME "CORTEXA72"
#else
#endif
#ifdef FORCE_CORTEXA73
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA73"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA73 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa73"
#define CORENAME "CORTEXA73"
#else
#endif
#ifdef FORCE_FALKOR
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "FALKOR"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DFALKOR " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "falkor"
#define CORENAME "FALKOR"
#else
#endif
@@ -973,13 +1052,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DTHUNDERX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX2T99"
@@ -990,12 +1071,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
#endif
#ifdef FORCE_TSV110
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "TSV110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTSV110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"
@@ -1016,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13"
#endif
#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif
#ifndef FORCE
#ifdef USER_TARGET
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt"
#endif
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
#ifndef POWER

View File

@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
rotm.c rotmg.c # N.B. these do not have complex counterparts
rot.c
asum.c
sum.c
)
# these will have 'z' prepended for the complex version
@@ -23,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
endif ()
if (${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
endif ()
endforeach ()

View File

@@ -25,7 +25,7 @@ SBLAS1OBJS = \
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
scopy.$(SUFFIX) sscal.$(SUFFIX) \
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
@@ -51,7 +51,7 @@ DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
ddot.$(SUFFIX) \
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
@@ -76,7 +76,7 @@ CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
scamax.$(SUFFIX) icamax.$(SUFFIX) \
scamin.$(SUFFIX) icamin.$(SUFFIX) \
csrot.$(SUFFIX) crotg.$(SUFFIX) \
@@ -105,7 +105,7 @@ ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
@@ -146,7 +146,7 @@ QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qdot.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -168,7 +168,7 @@ XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -224,7 +224,7 @@ QBLAS3OBJS = \
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -263,7 +263,8 @@ CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -280,7 +281,8 @@ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -300,7 +302,8 @@ CCBLAS1OBJS = \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX)
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@@ -326,7 +329,9 @@ CZBLAS1OBJS = \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX)
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
CZBLAS2OBJS = \
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \
@@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c $< -o $(@F)
@@ -1383,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@@ -1395,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@@ -1402,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

View File

@@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (alpha == ZERO) return;
if (incx == 0 && incy == 0) {
*y += n * alpha *(*x);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -86,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@@ -271,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
PRINT_DEBUG_CNAME;
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) {
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#ifndef COMPLEX
args.alpha = (void *)&alpha;
args.beta = (void *)&beta;

View File

@@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
blas_level1_thread(mode, n, k1, k2, dummyalpha,
a, lda, NULL, 0, ipiv, incx,
laswp[flag], nthreads);
(int(*)())laswp[flag], nthreads);
}
#endif

View File

@@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads);
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads);
}
#endif

97
interface/sum.c Normal file
View File

@@ -0,0 +1,97 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef CBLAS
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;
PRINT_DEBUG_NAME;
if (n <= 0) return 0;
IDEBUG_START;
FUNCTION_PROFILE_START();
ret = (FLOATRET)SUM_K(n, x, incx);
FUNCTION_PROFILE_END(COMPSIZE, n, n);
IDEBUG_END;
return ret;
}
#else
#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif
FLOAT ret;
PRINT_DEBUG_CNAME;
if (n <= 0) return 0;
IDEBUG_START;
FUNCTION_PROFILE_START();
ret = SUM_K(n, x, incx);
FUNCTION_PROFILE_END(COMPSIZE, n, n);
IDEBUG_END;
return ret;
}
#endif

View File

@@ -42,7 +42,7 @@
#include "functable.h"
#endif
#if defined(THUNDERX2T99) || defined(VULCAN)
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
// Multithreaded swap gives performance benefits in ThunderX2T99
#else
// Disable multi-threading as it does not show any performance

View File

@@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
/* nthreads = num_cpu_avail(2);
nthreads = num_cpu_avail(2);
FIXME trmv_thread was found to be broken, see issue 1332 */
nthreads = 1;
if (nthreads == 1) {
#endif

View File

@@ -81,6 +81,12 @@
#endif
#endif
#ifndef COMPLEX
#define SMP_FACTOR 256
#else
#define SMP_FACTOR 128
#endif
static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef TRMM
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN,
@@ -198,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
if (side < 0) info = 1;
if (info != 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1);
return;
}
@@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
/*
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
*/
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);

View File

@@ -82,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
if (incx == 0 && incy == 0) {
*y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) );
*(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) );
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -93,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

Some files were not shown because too many files have changed in this diff Show More