Compare commits

...

929 Commits

Author SHA1 Message Date
Martin Kroeker
5fdf9ad24f Merge pull request #2228 from martin-frbg/issue2227
Add Intel Goldmont Plus CPUID
2019-08-19 18:26:51 +02:00
Martin Kroeker
2fe967c542 Merge branch 'develop' into issue2227 2019-08-19 14:20:39 +02:00
Martin Kroeker
6d8595351c Add Intel Goldmont Plus CPUID
fixes #2227
2019-08-19 14:19:21 +02:00
Martin Kroeker
f40200f559 Merge pull request #2223 from martin-frbg/getarch-pgi
Make getarch compile with PGI
2019-08-16 12:21:30 +02:00
Martin Kroeker
a95a5e52b8 Fix PGI compiler detection for getarch 2019-08-16 09:00:11 +02:00
Martin Kroeker
e3d846ab57 Do not use -march=native with the PGI compiler 2019-08-16 08:58:10 +02:00
Martin Kroeker
8506386d82 Merge pull request #1 from xianyi/develop
rebase
2019-08-16 08:56:15 +02:00
Martin Kroeker
9ef96b32a6 Add multithreading support to the x86_64 zdot kernel (#2222)
* Add multithreading support

copied from the ThunderX2T99 kernel. For #2221
2019-08-15 22:09:12 +02:00
Martin Kroeker
b48c025974 Merge pull request #2218 from martin-frbg/issue2215
Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
2019-08-14 07:32:31 +02:00
Martin Kroeker
a1fce67743 Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
fixes #2215
2019-08-13 22:29:48 +02:00
Martin Kroeker
103b32fdb7 Merge pull request #2216 from martin-frbg/issue2214
Remove case-sensitivity in x86 LSAME on (AMD) cpus without CMOV
2019-08-13 13:59:33 +02:00
Martin Kroeker
aef9804089 Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV
Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches.
Fixes #2214
2019-08-13 10:19:10 +02:00
Martin Kroeker
303869f572 Update with changes from 0.3.7 2019-08-11 23:31:36 +02:00
Martin Kroeker
02d9203981 Increment version to 0.3.8.dev 2019-08-11 23:28:47 +02:00
Martin Kroeker
7b6808b69c Increment version to 0.3.8.dev 2019-08-11 23:28:13 +02:00
Martin Kroeker
321288597c Merge pull request #2212 from martin-frbg/nofort-nolib
Avoid spurious dependency on the fortran runtime despite NOFORTRAN=1
2019-08-11 20:26:34 +02:00
Martin Kroeker
be147a9f28 Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1
for cases where a fortran compiler is present but not wanted (e.g. not fully functional)
2019-08-11 16:24:39 +02:00
Martin Kroeker
c275290ea6 Merge pull request #2211 from martin-frbg/arm64_gcc_trivial
Silence two nuisance warnings from gcc
2019-08-11 16:08:05 +02:00
Martin Kroeker
b7bbb02447 Silence two nuisance warnings from gcc 2019-08-11 12:46:05 +02:00
Martin Kroeker
bf1430f7d7 Merge pull request #2208 from martin-frbg/munmap-debug
Provide more information on mmap/munmap failure
2019-08-09 07:55:35 +02:00
Martin Kroeker
dccff2e785 Merge pull request #2206 from martin-frbg/zen-dtrmm
Replace vpermpd with vpermilpd in the Haswell DTRMM kernel
2019-08-09 07:55:20 +02:00
Martin Kroeker
5c3458a6e7 Merge pull request #2199 from martin-frbg/zen-dtrsm
Replace most vpermpd calls in the Haswell DTRSM_RN kernel
2019-08-09 07:55:02 +02:00
Martin Kroeker
1776ad82c0 Add files via upload 2019-08-09 00:08:11 +02:00
Martin Kroeker
4e2f81cfa1 Provide more information on mmap/munmap failure
for #2207
2019-08-08 23:15:35 +02:00
Martin Kroeker
acf6002ab2 Replace most vpermpd calls in the Haswell DTRSM_RN kernel 2019-08-03 12:40:13 +02:00
Martin Kroeker
96a794e9fd Merge pull request #2198 from martin-frbg/icelake
Update CPUID recognition for Intel Ice Lake
2019-08-02 08:36:14 +02:00
Martin Kroeker
3d36c45116 Add CPUID identification of Intel Ice Lake 2019-08-01 22:52:35 +02:00
Martin Kroeker
648491e1aa Autodetect Intel Ice Lake (as SKYLAKEX target) 2019-08-01 22:51:09 +02:00
Martin Kroeker
2dfb804cb9 Replace vpermpd with vpermilpd in the Haswell DTRMM kernel
to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186
2019-07-28 23:17:28 +02:00
Martin Kroeker
4c153ec9da Merge pull request #2196 from wjc404/develop
Add vbroadcastsd kernel to dgemm_kernel_4x8_haswell.S
2019-07-28 23:11:40 +02:00
wjc404
7eecd8e39c Add files via upload 2019-07-28 07:39:09 +08:00
Martin Kroeker
f0406a7708 Merge pull request #2112 from ffontaine/develop
Makefile.arm: remove -march flags
2019-07-27 13:00:13 +02:00
Martin Kroeker
561f3fd995 Merge pull request #2193 from martin-frbg/makeutest
Override special make variables
2019-07-24 20:19:21 +02:00
Martin Kroeker
30efed14d1 Unset special make variables in ctest Makefile as well 2019-07-24 15:26:09 +02:00
Martin Kroeker
af2e7f28fc Override special make variables
as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. 
(Other subtargets appear to be unaffected as they do not use implicit make rules)
2019-07-23 16:56:40 +02:00
Martin Kroeker
4250e6ed64 Merge pull request #2191 from tylerjereddy/conditional_updates
MAINT: remove legacy CMake endif()
2019-07-23 16:20:39 +02:00
Martin Kroeker
7b0b7c11d2 Merge pull request #2190 from martin-frbg/zdot-zen
Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel
2019-07-23 16:15:08 +02:00
Martin Kroeker
d14cf1ccf4 Merge pull request #2189 from wjc404/develop
Update dgemm_kernel_4x8_haswell.S for reducing cache misses
2019-07-23 08:32:56 +02:00
Tyler Reddy
3f6ab1582a MAINT: remove legacy CMake endif()
* clean up a case where CMake endif()
contained the conditional used in the
if(), which is no longer needed /
discouraged since our minimum required
CMake version supports the modern syntax
2019-07-22 21:24:57 -06:00
Martin Kroeker
28e96458e5 Replace vpermpd with vpermilpd
to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180)
2019-07-22 08:28:16 +02:00
wjc404
95fb98f556 Update dgemm_kernel_4x8_haswell.S 2019-07-21 01:10:32 +08:00
wjc404
4801c6d36b Update dgemm_kernel_4x8_haswell.S 2019-07-21 00:47:45 +08:00
wjc404
9440fa607d Add files via upload 2019-07-20 22:08:22 +08:00
wjc404
94db259e5b Add files via upload 2019-07-20 22:04:41 +08:00
wjc404
f49f8047ac Add files via upload 2019-07-20 14:33:37 +08:00
wjc404
825777faab Update dgemm_kernel_4x8_haswell.S 2019-07-19 23:58:24 +08:00
wjc404
9c89757562 Add files via upload 2019-07-19 23:47:58 +08:00
Martin Kroeker
b0b7600bef Merge pull request #2186 from wjc404/develop
Update "dgemm_kernel_4x8_haswell.S" for improving performance on zen2 chips
2019-07-18 16:04:44 +02:00
wjc404
9b04baeaee Update dgemm_kernel_4x8_haswell.S 2019-07-17 23:50:03 +08:00
wjc404
8a074b3965 Update dgemm_kernel_4x8_haswell.S 2019-07-17 23:47:30 +08:00
wjc404
211ab03b14 Update dgemm_kernel_4x8_haswell.S 2019-07-17 22:39:15 +08:00
wjc404
1733f927e6 Update dgemm_kernel_4x8_haswell.S 2019-07-17 21:27:41 +08:00
wjc404
182b06d6ad Update dgemm_kernel_4x8_haswell.S 2019-07-17 17:02:35 +08:00
wjc404
7a9050d681 Update dgemm_kernel_4x8_haswell.S 2019-07-17 00:55:06 +08:00
wjc404
0ba29fd262 Update dgemm_kernel_4x8_haswell.S for zen2
replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128
2019-07-17 00:46:51 +08:00
Martin Kroeker
bafa021ed6 Merge pull request #2181 from isuruf/install_name
Change install_name on osx to match linux
2019-07-09 20:08:52 +02:00
Isuru Fernando
b89d9762a2 Change install_name on osx to match linux 2019-07-08 17:14:35 -05:00
Martin Kroeker
08dedf4c5e Merge pull request #2177 from martin-frbg/noaff
Fix surprising behaviour of NO_AFFINITY=0
2019-07-07 18:28:21 +02:00
Martin Kroeker
b89c781637 Fix surprising behaviour of NO_AFFINITY=0 2019-07-07 16:04:45 +02:00
Martin Kroeker
dd7ff77f4b Merge pull request #2175 from martin-frbg/cmake-mingw-fixes
Fix CMAKE compilation with MinGW32 and add it to Appveyor
2019-07-06 18:07:19 +02:00
Martin Kroeker
8fb76134bc Mingw32 needs leading underscore on object names
(also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile)
2019-07-06 15:07:15 +02:00
Martin Kroeker
04d671aae2 Make disabling DYNAMIC_ARCH on unsupported systems work
needs to be unset in the cache for the change to have any effect
2019-07-06 15:05:04 +02:00
Martin Kroeker
f69a0be712 Add getarch flags to disable AVX on x86
(and other small fixes to match Makefile behaviour)
2019-07-06 15:02:39 +02:00
Martin Kroeker
ae9e8b131e Add mingw builds to Appveyor config 2019-07-06 14:30:33 +02:00
Martin Kroeker
9086543f50 Utest needs CBLAS but not necessarily FORTRAN 2019-07-06 14:29:47 +02:00
Martin Kroeker
abea977ded Merge pull request #2162 from martin-frbg/pgi
Fixes for PGI compiler
2019-07-03 19:16:30 +02:00
Martin Kroeker
6b6c9b1441 Merge pull request #2172 from quickwritereader/develop
power9 cgemm/ctrmm. new sgemm 8x16
2019-07-01 21:06:02 +02:00
AbdelRauf
a97b301aaa cgemm/ctrmm power9 2019-07-01 14:07:54 +00:00
Martin Kroeker
2f13f04224 Merge pull request #2170 from pkubaj/patch-1
Fix build on PPC970 for FreeBSD
2019-06-30 23:29:02 +02:00
pkubaj
7c7505a778 Fix build for PPC970 on FreeBSD pt.2
FreeBSD needs those macros too.
2019-06-28 10:31:45 +00:00
pkubaj
5a4f1a2118 Fix build for PPC970 on FreeBSD pt. 1
FreeBSD needs DCBT_ARG=0 as well.
2019-06-28 10:29:44 +00:00
Martin Kroeker
3b761892df Merge pull request #2169 from pkubaj/develop
Fix build on FreeBSD/powerpc64.
2019-06-25 12:56:33 +02:00
Piotr Kubaj
eebfeba768 Fix build on FreeBSD/powerpc64.
Signed-off-by: Piotr Kubaj <pkubaj@anongoth.pl>
2019-06-25 10:58:56 +02:00
Martin Kroeker
7684c4f8f8 PGI compiler does not like -march=native 2019-06-20 19:56:01 +02:00
Martin Kroeker
7faf42b7bb Merge pull request #2167 from kavanabhat/dtrmm_power8_segfault
Fix DTRMMKERNEL register save for power8 64-bit mode (Fix for #2166)
2019-06-19 14:38:01 +02:00
kavanabhat
a575f1e4c7 Update dtrmm_kernel_16x4_power8.S 2019-06-19 15:27:14 +05:30
AbdelRauf
cdbfb891da new sgemm 8x16 2019-06-17 15:33:38 +00:00
Martin Kroeker
280552b988 Fix mov syntax 2019-06-16 18:35:43 +02:00
Martin Kroeker
bbd4bb0154 Zero ecx with a mov instruction
PGI assembler does not like the initialization in the constraints.
2019-06-16 15:04:10 +02:00
Martin Kroeker
6d3efb2b58 Update Makefile.x86_64 2019-06-14 08:08:11 +02:00
Martin Kroeker
d9ff2cd90d Do not force gcc options on non-gcc compilers
fixes compile failure with pgi 18.10 as reported on OpenBLAS-users
2019-06-13 23:01:35 +02:00
Martin Kroeker
2a43062de7 Merge pull request #2159 from martin-frbg/issue2149
Avoid unintentional activation of TLS codepath via USE_TLS=0
2019-06-10 19:12:45 +02:00
Martin Kroeker
4ea794a522 Avoid unintentional activation of TLS code via USE_TLS=0
fixes #2149
2019-06-10 17:24:15 +02:00
Martin Kroeker
ece0bfb881 Merge pull request #2158 from martin-frbg/issue2143
Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds
2019-06-10 14:08:11 +02:00
Martin Kroeker
1f4b6a5d5d Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds
from #2143, -march=native precludes use of more specific options like -march=skylake-avx512 in individual kernels, and defeats the purpose of dynamic arch anyway.
2019-06-10 09:50:13 +02:00
Martin Kroeker
be8f70d269 Merge pull request #2157 from martin-frbg/2154-2
Add gfortran workaround for potential ABI violation
2019-06-09 12:19:08 +02:00
Martin Kroeker
e674e1c735 Update fc.cmake 2019-06-09 09:31:13 +02:00
Martin Kroeker
6ca898b63b Add gfortran workaround for potential ABI violation
for #2154
2019-06-08 23:17:03 +02:00
Martin Kroeker
26411acd56 Merge pull request #2148 from TiborGY/cpp_thread_test_2
Thread safety tester using C++11 threading (cleaned history)
2019-06-07 13:23:07 +02:00
Martin Kroeker
0ab4076dd8 Merge pull request #2156 from martin-frbg/issue2154
Add gfortran workaround for C->FORTRAN ABI violation
2019-06-06 13:43:12 +02:00
Martin Kroeker
a0caa762b3 Add gfortran workaround for ABI violations
for #2154 (see gcc bug 90329)
2019-06-06 10:24:16 +02:00
Martin Kroeker
900d5a3205 Add gfortran workaround for ABI violations in LAPACKE
for #2154 (see gcc bug 90329)
2019-06-06 10:18:40 +02:00
Martin Kroeker
a17cf36225 Merge pull request #2153 from quickwritereader/develop
improved power9 zgemm,sgemm
2019-06-06 07:42:56 +02:00
AbdelRauf
148c4cc5fd conflict resolve 2019-06-05 20:50:50 +00:00
AbdelRauf
d0c3543c3f power9 zgemm ztrmm optimized 2019-06-05 20:07:16 +00:00
Martin Kroeker
909ad04aef Merge pull request #2145 from martin-frbg/1912-3
Separate implementations of AMAX and IAMAX on arm
2019-06-05 20:27:45 +02:00
Martin Kroeker
417efd41c6 Merge pull request #2110 from pc2/cpu-detection
Fix detection of Skylake processors when using GCC
2019-06-05 20:27:05 +02:00
Michael Lass
9cdc828afa c_check: Unlink correct file 2019-06-05 17:31:01 +02:00
Michael Lass
7a9a4dbc4f Fix detection of AVX512 capable compilers in getarch
21eda8b5 introduced a check in getarch.c to test if the compiler is capable of
AVX512. This check currently fails, since the used __AVX2__ macro is only
defined if getarch itself was compiled with AVX2/AVX512 support. Make sure this
is the case by building getarch with -march=native on x86_64. It is only
supposed to run on the build host anyway.
2019-06-05 17:30:56 +02:00
AbdelRauf
a469b32cf4 sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52 2019-06-04 07:11:30 +00:00
Martin Kroeker
27649b9543 Document NO_AVX512
for #2151
2019-06-03 11:01:33 +02:00
TiborGY
16f3df5d35 add c++ thread test option to Makefile.rule 2019-06-01 21:36:41 +02:00
TiborGY
1aded69821 hook up c++ thread safety test (main Makefile) 2019-06-01 21:32:52 +02:00
TiborGY
c00289ba54 upload thread safety test folder 2019-06-01 21:30:06 +02:00
AbdelRauf
8fe794f059 improved zgemm power9 based on power8 2019-05-30 15:31:25 +00:00
Martin Kroeker
74c10b57c6 Use generic kernels for complex (I)AMAX to support softfp 2019-05-30 11:38:11 +02:00
Martin Kroeker
c5495d2056 Ensure correct output for DAMAX with softfp 2019-05-30 11:25:43 +02:00
Martin Kroeker
c70496b108 Separate implementations of AMAX and IAMAX on arm
As noted in #1912 and comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but cannot return both value and index on softfp where they would have to share the return register
2019-05-29 15:02:51 +02:00
Martin Kroeker
ca8d8835f5 Merge pull request #2144 from xianyi/revert-2142-issue1912-2
Revert "Add softfp support in min/max kernels"
2019-05-29 14:09:10 +02:00
Martin Kroeker
d76b20b4d2 Revert "Add softfp support in min/max kernels" 2019-05-29 14:07:17 +02:00
Martin Kroeker
85af04da3c Merge pull request #2142 from martin-frbg/issue1912-2
Add softfp support in min/max kernels
2019-05-28 22:56:08 +02:00
Martin Kroeker
11e0dcbffb Merge pull request #2141 from martin-frbg/issue1912
Build and run utests independently of fortran
2019-05-28 20:50:40 +02:00
Martin Kroeker
79366ff7a9 Add softfp support in min/max kernels
fix for #1912
2019-05-28 20:34:22 +02:00
Martin Kroeker
21d05a4835 Merge pull request #2140 from martin-frbg/pgi19
Do not try ancient PGI hacks with recent versions of that compiler
2019-05-26 12:39:20 +02:00
Martin Kroeker
940f38f6dd Build and run utests in any case, they do their own checks for fortran availability 2019-05-24 13:02:23 +02:00
Martin Kroeker
1778fd4219 Do not try ancient PGI hacks with recent versions of that compiler
should fix #2139
2019-05-22 13:48:27 +02:00
Martin Kroeker
969dd6175e Merge pull request #2136 from martin-frbg/issue2126
Add option to allow combining USE_THREAD=0 with thread locking support
2019-05-16 12:08:16 +02:00
Martin Kroeker
d8d5682481 Merge pull request #2134 from tylerjereddy/skylake_regress_guard_may14
TST: add SkylakeX AVX512 CI test
2019-05-15 23:40:06 +02:00
Martin Kroeker
f66c11fc22 Remove unrelated change 2019-05-15 23:38:12 +02:00
Martin Kroeker
5ecffc28f2 Add option USE_LOCKING but keep default settings intact 2019-05-15 23:36:17 +02:00
Martin Kroeker
86dda5c2fa Add option USE_LOCKING for SMP-like locking in USE_THREAD=0 builds 2019-05-15 23:21:20 +02:00
Martin Kroeker
1e52572be3 Add option USE_LOCKING for single-threaded build with locking support 2019-05-15 23:19:30 +02:00
Martin Kroeker
d2cb610272 Add option USE_LOCKING for single-threaded build with locking support
for calling from concurrent threads
2019-05-15 23:18:43 +02:00
Tyler Reddy
a211bc9b6a TST: add SkylakeX AVX512 CI test
* adapt the C-level reproducer code for some
recent SkylakeX AVX512 kernel issues, provided
by Isuru Fernando and modified by Martin Kroeker,
for usage in the utest suite

* add an Intel SDE SkylakeX emulation utest run to
the Azure CI matrix; a custom Docker build was required
because Ubuntu image provided by Azure does not support
AVX512VL instructions
2019-05-14 11:32:23 -07:00
Martin Kroeker
9208ab8603 Merge pull request #2130 from isuruf/drone
Drone CI for arm64 native builds
2019-05-14 09:37:00 +02:00
Isuru Fernando
b43deb4ad6 Fix typo 2019-05-12 15:26:18 -05:00
Isuru Fernando
b911525c81 arm32 build 2019-05-12 15:21:43 -05:00
Isuru Fernando
7ff44e0016 Remove qemu armv8 builds 2019-05-12 15:09:53 -05:00
Isuru Fernando
e3cb8ad2d6 See if ubuntu 19.04 fixes the ICE 2019-05-12 14:28:48 -05:00
Isuru Fernando
7aa6faad5f parallel build 2019-05-12 14:22:36 -05:00
Isuru Fernando
3d94ab660f build without lapack on cmake 2019-05-12 14:17:12 -05:00
Isuru Fernando
cd99dfe034 Add cmake builds and print options 2019-05-12 14:10:10 -05:00
Isuru Fernando
dadafcdcd8 Add a cmake build as well 2019-05-12 14:10:10 -05:00
Isuru Fernando
d40c109eb0 no need of gcc in clang build 2019-05-12 14:10:10 -05:00
Isuru Fernando
608cd69b66 update yes 2019-05-12 14:10:10 -05:00
Isuru Fernando
231472c4c6 Fix typo 2019-05-12 14:10:10 -05:00
Isuru Fernando
612c2d78e0 apt update 2019-05-12 14:10:10 -05:00
Isuru Fernando
dc110e179d Switch to ubuntu and parallel jobs 2019-05-12 14:10:09 -05:00
Isuru Fernando
9184590c33 gfortran->gcc-gfortran 2019-05-12 14:10:09 -05:00
Isuru Fernando
a0aaf308ed Install gfortran and add a clang job 2019-05-12 14:10:09 -05:00
Isuru Fernando
15f925fe9a Install perl 2019-05-12 14:10:09 -05:00
Isuru Fernando
21acf03e9a Install gcc 2019-05-12 14:10:09 -05:00
Isuru Fernando
ff807473bb remove sudo 2019-05-12 14:10:09 -05:00
Isuru Fernando
58829c0988 install make 2019-05-12 14:10:09 -05:00
Isuru Fernando
d86f0b9e74 Test drone CI 2019-05-12 14:10:09 -05:00
Martin Kroeker
63554d5dec Merge pull request #2129 from martin-frbg/armv8azure
Move ARMv8/gcc CI job from Travis to Azure
2019-05-12 09:55:57 +02:00
Martin Kroeker
43068288e9 Update .travis.yml 2019-05-11 22:37:06 +02:00
Martin Kroeker
999a04f101 Move ARMv8 gcc build from Travis to Azure 2019-05-11 16:08:23 +02:00
Martin Kroeker
3cb1c8d210 Move ARMv8 gcc build from Travis to Azure 2019-05-11 16:07:30 +02:00
Martin Kroeker
ff1bfe7b16 Merge pull request #2127 from martin-frbg/issue2114_2
Add NO_AFFINITY to available CMAKE options on Linux, and set it to ON
2019-05-09 15:25:09 +02:00
Martin Kroeker
9ea30f3788 Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125)
* Mark iamax_sse.S as unsuitable for MIN due to issue #2116
* Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116
2019-05-09 14:42:36 +02:00
Martin Kroeker
a3d4c65d62 Add NO_AFFINITY to available options on Linux, and set it to ON
to match the gmake default. Fixes second part of #2114
2019-05-09 11:52:02 +02:00
Martin Kroeker
e1fc02095c Merge pull request #2124 from tylerjereddy/manylinux1_azure
TST: Azure manylinux1 & clean-up
2019-05-09 08:57:37 +02:00
Martin Kroeker
0cd6d8508f Merge pull request #2123 from tylerjereddy/azure_readme_badge
DOC: Add Azure CI status badge to README
2019-05-09 08:10:19 +02:00
Martin Kroeker
c2f152c470 Merge pull request #2120 from brada4/getrf-2113
Address redundant code concern #2113
2019-05-09 08:10:00 +02:00
Tyler Reddy
4efbac28ed TST: Azure manylinux1 & clean-up
* remove some of the steps & comments
from the original Azure yml template

* modify the trigger section to use
develop since OpenBLAS primarily uses
this branch; use the same batching
behavior as downstream projects NumPy/
SciPy

* remove Travis emulated ARMv6 gcc build
because this now happens in Azure

* use documented Ubuntu vmImage name for Azure
and add in a manylinux1 test run to the matrix

[skip appveyor]
2019-05-08 21:58:49 -07:00
Martin Kroeker
406c7242f4 Add ARMV6 build to azure CI setup (#2122)
using aytekinar's Alpine image and docker script from the Travis setup

[skip ci]
2019-05-09 00:47:44 +02:00
Tyler Reddy
53703585aa DOC: Add Azure CI status badge 2019-05-08 15:15:50 -07:00
Martin Kroeker
ad20ceaa68 Update azure-pipelines.yml 2019-05-08 19:07:58 +02:00
Martin Kroeker
dd77a3f0e2 Update azure-pipelines.yml 2019-05-08 15:25:43 +02:00
Martin Kroeker
a598ab1d32 Update azure-pipelines.yml 2019-05-08 15:23:54 +02:00
Martin Kroeker
16fd8e3dbe Update azure-pipelines.yml 2019-05-08 14:14:22 +02:00
Martin Kroeker
aa4c41bad2 Update azure-pipelines.yml
take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines file)
2019-05-08 14:12:02 +02:00
Martin Kroeker
5cf434167a fix tabbing in azure commands 2019-05-08 13:58:59 +02:00
Martin Kroeker
3a49e8c05a first try migrating one of the arm builds from travis 2019-05-08 13:52:22 +02:00
Martin Kroeker
95e2cf32e1 Merge pull request #2121 from tylerjereddy/ppc64le-travis
TST: add native POWER8 to CI
2019-05-08 13:31:46 +02:00
Martin Kroeker
70cea0b96b Update link to IBM MASS library, update cpu support status 2019-05-08 12:20:00 +02:00
Martin Kroeker
ae0dec77ec Merge pull request #2118 from Diazonium/develop
Change two http links to https
2019-05-08 11:41:17 +02:00
Tyler Reddy
e47b63466b TST: add native POWER8 to CI
* add native POWER8 testing to
Travis CI matrix with ppc64le
os entry
2019-05-07 19:11:08 -07:00
Zhang Xianyi
7d1b468d9d Set up CI with Azure Pipelines
[skip ci]
2019-05-08 09:58:01 +08:00
Andrew
575a84398a remove redundant code #2113 2019-05-07 23:46:54 +03:00
Martin Kroeker
5cabda79d0 Merge pull request #2117 from martin-frbg/issue2114
Fix errors in cpu affinity setup with glibc 2.6
2019-05-07 18:18:16 +02:00
Diazonium
c516209581 Change two http links to https
Closes #2109
2019-05-07 14:55:20 +02:00
Martin Kroeker
a6a8cc2b7f Fix errors in cpu enumeration with glibc 2.6
for #2114
2019-05-07 13:34:52 +02:00
Andrew
3d7debbb28 init 2019-05-07 13:15:08 +03:00
Fabrice Fontaine
5a9cce2bf6 Makefile.arm: remove -march flags
The provided -march flags, especially for ARMv5 and ARMv6 may not
necessarily match the needed ones: for ARMv5, it might be armv5,
armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain
sysroot can be used in a multilib toolchain.

Therefore, let the user building OpenBLAS pass the appropriate -march
flag.

The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they
are actually required for the build to proceed (OpenBLAS uses VFP
instructions, and assumes an EABIhf ABI).

[Peter: update for v0.2.20]
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Peter Korsgaard <peter@korsgaard.com>
[Retrieved from:
https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch]
Signed-off-by: Fabrice Fontaine <fontaine.fabrice@gmail.com>
2019-05-05 18:37:28 +02:00
Martin Kroeker
6a8b4269b5 Merge pull request #2111 from martin-frbg/issue1955
Disable the SkyLakeX DGEMMIxCOPY kernels as well
2019-05-05 18:08:49 +02:00
Martin Kroeker
b1561ecc68 Disable DGEMMINCOPY as well for now
#1955
2019-05-05 15:52:01 +02:00
Martin Kroeker
7ed8431527 Disable the SkyLakeX DGEMMITCOPY kernel as well
as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955
2019-05-04 22:54:41 +02:00
Martin Kroeker
a387a23518 Merge pull request #2101 from luzpaz/misc-typos
Misc. typo fixes in comments and documentation
2019-05-04 22:28:29 +02:00
luz.paz
b46875b76b Revert Changelog.txt typos 2019-05-04 15:43:17 -04:00
luz.paz
858e609e1f Revert reference/ fixes 2019-05-04 15:01:29 -04:00
Martin Kroeker
3f427c0cf9 Merge pull request #2107 from quickwritereader/develop
sgemm/strmm kernel for power9
2019-05-02 07:56:57 +02:00
Martin Kroeker
c95317158f Merge pull request #2105 from martin-frbg/issue2104
Correct argument of CPU_ISSET for glibc <2.5
2019-05-02 07:56:37 +02:00
AbdelRauf
47f892198c conflict resolve 2019-05-01 19:36:22 +00:00
Martin Kroeker
b43c8382c8 Correct argument of CPU_ISSET for glibc <2.5
fixes #2104
2019-05-01 10:46:46 +02:00
luz.paz
daf2fec12d Misc. typo fixes
Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib`
2019-04-29 17:03:56 -04:00
Martin Kroeker
4f8143b098 Increment version to 0.3.7.dev 2019-04-29 19:25:32 +02:00
Martin Kroeker
bfeb9c16b0 Increment version to 0.3.7.dev 2019-04-29 19:24:53 +02:00
Martin Kroeker
97d5034ed3 Merge branch 'release-0.3.0' into develop 2019-04-29 19:21:54 +02:00
Martin Kroeker
9763f872fc Update Changelog with changes from 0.3.6 2019-04-29 19:18:26 +02:00
AbdelRauf
628b335e83 Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop 2019-04-29 08:57:44 +00:00
AbdelRauf
0f105dd8a5 sgemm/strmm 2019-04-29 08:49:50 +00:00
Martin Kroeker
9c4edd38f2 Merge pull request #2099 from martin-frbg/rela-gbtrf
Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF
2019-04-29 09:25:19 +02:00
Martin Kroeker
1036299da0 Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF
due to crashes in LAPACK tests
2019-04-29 00:12:37 +02:00
Martin Kroeker
5b0398186e Merge pull request #2098 from martin-frbg/rela-malloc
Disable reallocation of work array in ReLAPACK xSYTRF
2019-04-28 19:31:01 +02:00
Martin Kroeker
452859f4e1 Merge pull request #2097 from martin-frbg/rela-getrf
Correct INFO=4 condition in ReLAPACK xGETRF
2019-04-28 19:28:57 +02:00
Martin Kroeker
2cd463eabd Disable reallocation of work array in xSYTRF
as it appears to cause memory management problems (seen in the LAPACK tests)
2019-04-28 10:02:28 +02:00
Martin Kroeker
11530b76f7 Correct INFO=4 condition 2019-04-28 09:58:56 +02:00
Martin Kroeker
91943b7325 Merge pull request #2096 from martin-frbg/eig-testing
Avoid out-of-bounds accesses in LAPACK EIG tests
2019-04-28 09:55:42 +02:00
Martin Kroeker
268c28db7d Merge pull request #2095 from martin-frbg/trsm
Correct length of name string in xerbla call
2019-04-28 09:55:25 +02:00
Martin Kroeker
2aad88d5b9 Avoid out-of-bounds accesses in LAPACK EIG tests
see https://github.com/Reference-LAPACK/lapack/issues/333
2019-04-27 23:01:49 +02:00
Martin Kroeker
0bd956fd21 Correct length of name string in xerbla call 2019-04-27 22:49:04 +02:00
Martin Kroeker
bbd9d98664 Merge pull request #2094 from martin-frbg/issue2066
Fix ReLAPACK integration problems
2019-04-27 22:45:47 +02:00
Martin Kroeker
798c448b0c Add support for INTERFACE64 and fix XERBLA calls
1. Replaced all instances of "int" with "blasint"
2. Added string length as "hidden" third parameter in calls to fortran XERBLA
2019-04-27 19:06:00 +02:00
Martin Kroeker
9a19616a28 Support INTERFACE64=1 2019-04-27 18:55:47 +02:00
Martin Kroeker
6b41eb9c0c Merge pull request #2092 from jeffbaylor/snprintf_with_MSC_VER
snprintf define consolidated to common.h
2019-04-23 20:12:06 +02:00
Martin Kroeker
ccfb7ead15 Merge pull request #2072 from martin-frbg/sum
Add (C)BLAS extension ?sum
2019-04-23 20:11:36 +02:00
Jeff Baylor
40e53e52d6 snprintf define consolidated to common.h 2019-04-22 17:01:34 -07:00
Martin Kroeker
744779d335 Merge pull request #2084 from RashmicaG/develop
Add in runtime CPU detection for POWER.
2019-04-14 21:40:07 +02:00
Rashmica Gupta
bcdf1d4917 Add in runtime CPU detection for POWER. 2019-04-09 14:20:16 +10:00
Martin Kroeker
e06b8438b4 Merge pull request #2080 from martin-frbg/issue2075
Add -lm and disable EXPRECISION support on *BSD
2019-04-02 21:40:58 +02:00
Martin Kroeker
9229d6859b Add -lm and disable EXPRECISION support on *BSD
fixes #2075
2019-04-02 09:38:18 +02:00
Martin Kroeker
21d146a8de Add declarations for ?sum 2019-03-31 22:12:23 +02:00
Martin Kroeker
7f4e36d219 Merge pull request #2073 from martin-frbg/issue2056-2
Detect 32bit environment on 64bit ARM hardware
2019-03-31 13:56:08 +02:00
Martin Kroeker
c04a729081 Add ?sum definitions for generic kernel 2019-03-31 13:55:49 +02:00
Martin Kroeker
100d94f94e Add ?sum 2019-03-31 13:55:05 +02:00
Martin Kroeker
d17da6c6a4 Add cmake defaults for ?sum kernels 2019-03-31 11:57:01 +02:00
Martin Kroeker
1679de5e59 Detect 32bit environment on 64bit ARM hardware
for #2056, using same approach as #2058
2019-03-31 10:50:43 +02:00
Martin Kroeker
246ca29679 Add ZARCH implementation of ?sum
as trivial copies of the respective ?asum kernels with the ABS and vflpsb calls removed
2019-03-30 22:49:05 +01:00
Martin Kroeker
9d717cb5ee Add x86_64 implementation of ?sum
as trivial copy of ?asum with the fabs calls removed
2019-03-30 22:27:04 +01:00
Martin Kroeker
e3bc83f2a8 Add x86 implementation of ?sum
as trivial copy of ?asum with the fabs calls removed
2019-03-30 22:26:10 +01:00
Martin Kroeker
70f2a4e0d7 Add SPARC implementation of ?sum
as trivial copy of ?asum with the fabs replaced by fmov to preserve code structure
2019-03-30 22:25:06 +01:00
Martin Kroeker
706dfe263b Add POWER implementation of ?sum
as trivial copy of ?asum with the fabs replaced by fmr to preserve code structure
2019-03-30 22:23:42 +01:00
Martin Kroeker
688fa9201c Add MIPS64 implementation of ?sum
as trivial copy of ?asum with the fabs replaced by mov to preserve code structure
2019-03-30 22:22:15 +01:00
Martin Kroeker
cdbe0f0235 Add MIPS implementation of ?sum
as trivial copy of ?asum with the fabs calls removed
2019-03-30 22:20:14 +01:00
Martin Kroeker
f8b82bc6dc Add ia64 implementation of ?sum
as trivial copy of asum with the fabs calls removed
2019-03-30 22:18:03 +01:00
Martin Kroeker
3e3ccb9011 Add ARM64 implementations of ?sum
as trivial copies of the respective ?asum kernels with the fabs calls removed
2019-03-30 22:13:36 +01:00
Martin Kroeker
94ab4e6fb2 Add ARM implementations of ?sum
(trivial copies of the respective ?asum with the fabs calls removed)
2019-03-30 22:11:38 +01:00
Martin Kroeker
c3cfc6986b Add implementations of ssum/dsum and csum/zsum
as trivial copies of asum/zsasum with the fabs calls replaced by fmov to preserve code structure
2019-03-30 22:05:11 +01:00
Martin Kroeker
b9f4943a14 Add ?sum 2019-03-30 22:01:13 +01:00
Martin Kroeker
79cfc24a62 Add interface for ?sum (derived from ?asum) 2019-03-30 21:59:18 +01:00
Martin Kroeker
5c42287c4f Add declarations for ?sum and cblas_?sum 2019-03-30 21:58:03 +01:00
Martin Kroeker
32c7063cb0 Merge pull request #2061 from martin-frbg/martin-frbg-patch-1
Disable the AVX512 DGEMM kernel (again)
2019-03-30 21:21:38 +01:00
Martin Kroeker
c19a449096 Merge pull request #2071 from martin-frbg/issue2068
Provide CBLAS interfaces to I?MIN and I?MAX
2019-03-30 14:54:28 +01:00
Martin Kroeker
3d1e36d4cb Build CBLAS interfaces for I?MIN and I?MAX 2019-03-30 12:38:41 +01:00
Martin Kroeker
4f9d3e4b28 Expose CBLAS interfaces for I?MIN and I?MAX 2019-03-30 12:37:13 +01:00
Martin Kroeker
4dec151d0b Merge pull request #2070 from quickwritereader/develop
power9 makefile. dgemm based on power8 kernel with following changes …
2019-03-29 21:46:21 +01:00
Martin Kroeker
7c51cc8527 Merge branch 'develop' into develop 2019-03-29 19:36:29 +01:00
AbdelRauf
853a18bc17 power9 makefile. dgemm based on power8 kernel with following changes : 32x unrolled 16x4 kernel and 8x4 kernel using (lxv stxv butterfly rank1 update). improvement from 17 to 22-23gflops. dtrmm cases were added into dgemm itself 2019-03-29 15:49:40 +00:00
Martin Kroeker
3ae122e2c7 Merge pull request #2069 from aixoss/aix-asm-change
AIX asm syntax changes needed for shared object creation
2019-03-25 21:34:30 +01:00
Ayappan P
b043a5962e AIX asm syntax changes needed for shared object creation 2019-03-25 18:53:25 +05:30
Martin Kroeker
8502030e5e Merge pull request #2064 from embray/cygwin/use-tls-thread-memory-cleanup
Fix for #2063
2019-03-19 22:12:51 +01:00
Erik M. Bray
8ba9e2a61a Also call CloseHandle on each thread, as well as on the event so as to not leak thread handles. 2019-03-19 11:21:44 +01:00
Erik M. Bray
4ad694eda1 Fix for #2063: The DllMain used in Cygwin did not run the thread memory
pool cleanup upon THREAD_DETACH which is needed when compiled with
USE_TLS=1.
2019-03-19 09:26:50 +01:00
Martin Kroeker
dff4a197a5 Merge pull request #2058 from xsacha/patch-3
Change 64-bit detection as explained in #2056
2019-03-16 11:57:23 +01:00
Martin Kroeker
a5425575b1 Merge pull request #2060 from embray/cygwin/readenv
Use POSIX getenv on Cygwin
2019-03-16 11:56:51 +01:00
Erik M. Bray
1006ff8a7b Use POSIX getenv on Cygwin
The Windows-native GetEnvironmentVariable cannot be relied on, as
Cygwin does not always copy environment variables set through Cygwin
to the Windows environment block, particularly after fork().
2019-03-15 15:06:30 +01:00
Martin Kroeker
e608d4f7fe Disable the AVX512 DGEMM kernel (again)
Due to as yet unresolved errors seen in #1955 and #2029
2019-03-13 22:10:28 +01:00
Martin Kroeker
4fc17d0d75 Trivial typo fix
as suggested in #2022
2019-03-13 19:20:23 +01:00
Sacha
c3e30b2bc2 Change 64-bit detection as explained in #2056 2019-03-13 23:21:54 +10:00
Martin Kroeker
03d7110900 Merge pull request #2042 from maomao194313/develop
add TARGET support for HiSilicon tsv110 CPUs
2019-03-12 22:57:39 +01:00
Martin Kroeker
3ce28fb81a Merge pull request #2055 from martin-frbg/atomid
Add CPUID data for Intel Denverton (as Nehalem)
2019-03-12 22:57:07 +01:00
Martin Kroeker
04f2226ea6 Add Intel Denverton 2019-03-12 16:09:55 +01:00
Martin Kroeker
b1393c7a97 Add Intel Denverton
for #2048
2019-03-12 16:03:56 +01:00
maomao194313
7e3eb9b25d make DYNAMIC_ARCH=1 package work on TSV110 2019-03-12 16:11:01 +08:00
maomao194313
f074d7d146 make DYNAMIC_ARCH=1 package work on TSV110. 2019-03-12 16:05:19 +08:00
Martin Kroeker
f18ab6c17b Merge pull request #2051 from martin-frbg/issue2048
Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1
2019-03-09 16:39:35 +01:00
Martin Kroeker
946ec6c3b8 Merge pull request #2050 from kencu/PowerMacFix
PowerMac 970 fixes
2019-03-09 16:39:08 +01:00
Martin Kroeker
5b95534afc Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1
for issue #2048
2019-03-09 11:21:16 +01:00
ken-cunningham-webuse
f7a06463d9 common_power.h: force DCBT_ARG 0 on PPC970 Darwin
without this, we see
../kernel/power/gemv_n.S:427:Parameter syntax error
and many more similar entries

that relates to this assembly command
dcbt 8, r24, r18

this change makes the DCBT_ARG = 0
and openblas builds through to completion on PowerMac 970
Tests pass
2019-03-07 12:03:45 -08:00
ken-cunningham-webuse
b0c714ef60 param.h : enable defines for PPC970 on DarwinOS
fixes:
gemm.c: In function 'sgemm_':
../common_param.h:981:18: error: 'SGEMM_DEFAULT_P' undeclared (first use in this function)
 #define SGEMM_P  SGEMM_DEFAULT_P
                  ^
2019-03-07 12:03:25 -08:00
Martin Kroeker
8d3d29e4d7 Merge pull request #2049 from Celelibi/fix_crash_sgemm_sse_x64
Fix crash in sgemm SSE/nano kernel on x86_64
2019-03-07 19:28:06 +01:00
Celelibi
b7f59da42d Fix crash in sgemm SSE/nano kernel on x86_64
Fix bug #2047.

Signed-off-by: Celelibi <celelibi@gmail.com>
2019-03-07 16:55:13 +01:00
Martin Kroeker
db3dc9e282 Merge pull request #2046 from kencu/powermac
ctest.c : add __POWERPC__ for PowerMac
2019-03-07 14:51:41 +01:00
ken-cunningham-webuse
4290afdae2 ctest.c : add __POWERPC__ for PowerMac 2019-03-06 20:55:06 -08:00
Martin Kroeker
4741ce803b Merge pull request #2045 from martin-frbg/2033-3
Do not compile in AVX512 check if AVX support is disabled
2019-03-06 22:40:26 +01:00
Martin Kroeker
11cfd0bd75 Do not compile in AVX512 check if AVX support is disabled
The xgetbv function depends on NO_AVX being undefined - we could change that too, but that combo is unlikely to work anyway
2019-03-05 16:04:25 +01:00
Martin Kroeker
651ab01d2b Merge pull request #2044 from martin-frbg/issue2043
Fix module definition conflicts between LAPACK and ReLAPACK
2019-03-05 12:11:32 +01:00
Martin Kroeker
d7b2c53c0b Merge pull request #2039 from brada4/meminit
Address warning in memory.c
2019-03-05 12:11:15 +01:00
Martin Kroeker
e4864a8933 Fix module definition conflicts between LAPACK and ReLAPACK
for #2043
2019-03-04 21:17:08 +01:00
Martin Kroeker
10d841d8b9 Merge pull request #2026 from martin-frbg/trmv_threads
Correct range limiting in trmv_thread and re-enable TRMV multithreading
2019-03-04 15:08:31 +01:00
Martin Kroeker
12f2b76748 Merge pull request #2038 from martin-frbg/issue2035
Improve handling of NO_STATIC and NO_SHARED
2019-03-04 15:07:48 +01:00
Martin Kroeker
6c83b878f6 Merge pull request #2040 from martin-frbg/locks2002
Restore locking optimizations for OpenMP case
2019-03-04 15:07:14 +01:00
maomao194313
fb4dae7124 add TARGET support for HiSilicon tsv110 CPUs 2019-03-04 16:48:49 +08:00
maomao194313
760842dda1 add TARGET support for HiSilicon tsv110 CPUs 2019-03-04 16:45:22 +08:00
maomao194313
53f482ee72 add TARGET support for HiSilicon tsv110 CPUs 2019-03-04 16:41:21 +08:00
maomao194313
783ba8058f HiSilicon tsv110 CPUs optimization branch
add HiSilicon tsv110 CPUs  optimization branch
2019-03-04 16:30:50 +08:00
Martin Kroeker
af480b02a4 Restore locking optimizations for OpenMP case
restore another accidentally dropped part of #1468 that was missed in #2004 to address performance regression reported in #1461
2019-03-03 14:17:07 +01:00
Andrew
e4a79be6bb address warning introduced with #1814 et al 2019-03-03 09:05:11 +02:00
Andrew
e5c316c6b9 init 2019-03-03 08:59:27 +02:00
Martin Kroeker
25427926bc Improve handling of NO_STATIC and NO_SHARED
to avoid surprises from defining either as zero. Fixes #2035 by addressing some concerns from #1422
2019-03-02 23:36:36 +01:00
Martin Kroeker
edb8143141 Merge pull request #2037 from martin-frbg/issue2033-2
Make sure that AVX512 is disabled in 32bit builds
2019-03-01 11:45:02 +01:00
Martin Kroeker
c4868d11c0 Make sure that AVX512 is disabled in 32bit builds
for #2033
2019-03-01 09:23:03 +01:00
Martin Kroeker
4c321ae571 Merge pull request #2034 from martin-frbg/issue2033
Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX
2019-02-28 22:10:12 +01:00
Martin Kroeker
2ffb727187 Keep xcode8.3 for osx BINARY=32 build
as xcode10 deprecated i386
2019-02-28 10:51:54 +01:00
Martin Kroeker
d66214c946 Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX
fixes #2033
2019-02-28 09:58:25 +01:00
Martin Kroeker
fd34820b99 Fix AVX512 test always returning false due to missing compiler option 2019-02-25 17:58:31 +01:00
Martin Kroeker
918a0cc4d1 Fix missing -c option in AVX512 test 2019-02-25 17:55:36 +01:00
Martin Kroeker
0db9c03e7e Merge pull request #2028 from brada4/mv
Move one of clobber fixes to right place
2019-02-24 19:50:23 +01:00
Andrew
6eee1beac5 move fix to right place 2019-02-24 20:41:02 +02:00
Andrew
e5df5958cc init 2019-02-24 20:39:25 +02:00
Martin Kroeker
343b301d14 Reduce list of kernels in the dynamic arch build
to make compilation complete reliably within the 1h limit again
2019-02-20 10:27:48 +01:00
Martin Kroeker
45333d5793 Fix error introduced during cleanup 2019-02-19 22:16:33 +01:00
Martin Kroeker
e29b0cfcc4 Allow multithreading TRMV again
revert workaround introduced for issue #1332 as the actual cause appears to be my incorrect fix from #1262 (see #1388)
2019-02-19 21:03:30 +01:00
Martin Kroeker
78d9910236 Correct range_n limiting
same bug as seen in #1388, somehow missed in corresponding PR #1389
2019-02-19 20:59:48 +01:00
Martin Kroeker
e12cdf58ef Merge pull request #2024 from martin-frbg/gcc9fixes4
Fix inline assembly constraints in Bulldozer TRSM kernels
2019-02-17 11:49:15 +01:00
Martin Kroeker
1860c9456d Merge pull request #2023 from martin-frbg/gcc9fixes3
Fix inline assembly constraints in various x86_64 GEMVN kernels
2019-02-17 11:48:57 +01:00
Martin Kroeker
aec905498f Merge pull request #1988 from TiborGY/patch-1
Reword/expand comments in Makefile.rule
2019-02-17 11:36:04 +01:00
TiborGY
56089991e2 fix the the 2019-02-16 23:26:13 +01:00
Martin Kroeker
f9bb76d29a Fix inline assembly constraints in Bulldozer TRSM kernels
rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009
2019-02-16 20:06:48 +01:00
Martin Kroeker
8242b1fe3f Fix inline assembly constraints 2019-02-16 18:51:09 +01:00
Martin Kroeker
efb9038f72 Fix inline assembly constraints 2019-02-16 18:46:17 +01:00
Martin Kroeker
e976557d29 Fix inline assembly constraints
rework indices to allow marking argument lda as input and output.
2019-02-16 18:36:39 +01:00
Martin Kroeker
9d8be15789 Fix inline assembly constraints
rework indices to allow marking argument lda4 as input and output. For #2009
2019-02-16 18:24:11 +01:00
Martin Kroeker
d752799a0f Merge pull request #2021 from martin-frbg/gcc9fixes2
Fix wrong constraints in inline assembly of Haswell DTRSM kernel
2019-02-16 18:05:40 +01:00
TiborGY
f209fc7fa9 Update Makefile.rule
add note about NUM_THREADS for package maintainers, add examples of programs that cause affinity troubles
2019-02-16 12:12:39 +01:00
Martin Kroeker
c26c0b77a7 Fix wrong constraints in inline assembly
for #2009
2019-02-15 15:08:16 +01:00
Martin Kroeker
1c6da2d03c Merge pull request #2019 from martin-frbg/gcc9fixes
Fix unannounced modification of input operand 8 (lda4) in Haswell GEMVN microkernel
2019-02-15 15:02:54 +01:00
Martin Kroeker
4255a58cd2 Rename operands to put lda on the input/output constraint list 2019-02-15 10:10:04 +01:00
Martin Kroeker
d3e4725548 Merge pull request #2020 from martin-frbg/issue1956
With the Intel compiler on Linux, prefer ifort for the final link step
2019-02-15 09:57:59 +01:00
Martin Kroeker
adb419ed67 With the Intel compiler on Linux, prefer ifort for the final link step
icc has known problems with mixed-language builds that ifort can handle just fine. Fixes #1956
2019-02-14 22:57:30 +01:00
Martin Kroeker
46e415b140 Save and restore input argument 8 (lda4)
Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009)
2019-02-14 22:43:18 +01:00
Martin Kroeker
cd5a59b9cf Merge pull request #2018 from bartoldeman/fix-dgemv-znver1-tree-vectorize
dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3
2019-02-14 21:55:11 +01:00
Bart Oldeman
69a97ca7b9 dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3
This fixes a crash in dblat2 when OpenBLAS is compiled using
-march=znver1 -ftree-vectorize -O2

See also:
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180
2019-02-14 16:27:58 +00:00
Martin Kroeker
b55c586fac Fix missing clobber in x86/x86_64 blas_quickdivide inline assembly function (#2017)
* Fix missing clobber in blas_quickdivide assembly
2019-02-14 15:21:36 +01:00
Martin Kroeker
056917d616 Merge pull request #2013 from martin-frbg/issue2011
Fix invalid memory access in PPC gemm_beta
2019-02-14 09:29:34 +01:00
Martin Kroeker
718efcec6f Fix out-of-bounds memory access in gemm_beta
Fixes #2011 (as suggested by davemq), assuming typo by K.Goto
2019-02-13 22:08:37 +01:00
Martin Kroeker
f9d67bb5e8 Fix out-of-bounds memory access in gemm_beta
Fixes #2011 (as suggested by davemq) presuming typo by K.Goto
2019-02-13 22:06:41 +01:00
Martin Kroeker
76bb74fcd4 Merge pull request #2012 from maamountki/z14
[ZARCH] Many improvements
2019-02-13 20:15:56 +01:00
maamountki
0a54c98b9d [ZARCH] Modify constraints 2019-02-13 21:06:25 +02:00
maamountki
bec54ae366 [ZARCH] Fix caxpy 2019-02-13 12:54:35 +02:00
Martin Kroeker
63d7bad8a5 Merge pull request #2010 from martin-frbg/issue2009
Fix declaration of input arguments in x86_64 GEMV, SYMV and DSCAL
2019-02-12 23:24:02 +01:00
Martin Kroeker
ab1630f9fa Fix declaration of arguments in inline assembly
Argument 0 is modified so should be input and output
2019-02-12 16:14:02 +01:00
Martin Kroeker
b824fa70eb Fix declaration of assembly arguments in SSYMV and DSYMV microkernels
Arguments 0 and 1 are both input and output
2019-02-12 16:00:18 +01:00
Martin Kroeker
91481a3e4e Fix declaration of input arguments in inline assembly
Argument 0 is modified as it doubles as a counter
2019-02-12 15:51:43 +01:00
Martin Kroeker
dc6ac9eab0 Fix declaration of input arguments in the x86_64 s/dGEMV_T and s/dGEMV_N kernels
Arguments 0 and 1 need to be tagged as both input and output
2019-02-12 15:33:48 +01:00
maamountki
f583674109 [ZARCH] Fix cgemv_t_4 2019-02-12 13:12:28 +02:00
maamountki
77fe70019f [ZARCH] Fix constraints and source code formatting 2019-02-11 16:01:13 +02:00
Martin Kroeker
03a2bf2602 Fix potential memory leak in cpu enumeration on Linux (#2008)
* Fix potential memory leak in cpu enumeration with glibc

An early return after a failed call to sched_getaffinity would leak the previously allocated cpu_set_t. Wrong calculation of the size argument in that call increased the likelihood of that failure. Fixes #2003
2019-02-10 23:24:45 +01:00
Martin Kroeker
69edc5bbe7 Restore dropped patches in the non-TLS branch of memory.c (#2004)
* Restore dropped patches in the non-TLS branch of memory.c

As discovered in #2002, the reintroduction of the "original" non-TLS version of memory.c as an alternate branch had inadvertently used ba1f91f rather than a8002e2 , thereby dropping the commits for #1450, #1468, #1501, #1504 and #1520.
2019-02-07 20:06:13 +01:00
maamountki
7039770165 [ZARCH] Undo the last commit 2019-02-06 20:11:44 +02:00
Martin Kroeker
641767f846 Merge pull request #2001 from martin-frbg/cmake-dynlist
Support DYNAMIC_LIST option in cmake
2019-02-06 08:39:24 +01:00
Martin Kroeker
af6e2253a2 Merge pull request #2000 from martin-frbg/issue1989
Make c_check robust against old or incomplete perl installations
2019-02-06 00:29:30 +01:00
Martin Kroeker
5952e586ce Support DYNAMIC_LIST option in cmake
e.g. cmake -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST="NEHALEM;HASWELL;ZEN" ..
original issue was #1639
2019-02-05 23:51:40 +01:00
Martin Kroeker
f10408aae8 Merge pull request #1999 from martin-frbg/issue1996-2
fix second instance of complex.h for c++ as well
2019-02-05 22:02:11 +01:00
Martin Kroeker
d70ae3ab43 Make c_check robust against old or incomplete perl installations
by catching and working around failures to load modules, and avoiding object-oriented syntax in tempfile creation.
Fixes #1989
2019-02-05 20:06:34 +01:00
Martin Kroeker
1391fc46d2 fix second instance of complex.h for c++ as well 2019-02-05 19:29:33 +01:00
maamountki
11a43e8116 [ZARCH] Set alignment hint for vl/vst 2019-02-05 19:17:08 +02:00
Martin Kroeker
817fe9865c Merge pull request #1998 from martin-frbg/issue1992
Include complex rather than complex.h in C++ contexts
2019-02-05 17:39:59 +01:00
Martin Kroeker
f4b82d7bc4 Include complex rather than complex.h in C++ contexts
to avoid name clashes e.g. with boost headers that use I as a generic placeholder.
Fixes #1992 as suggested by aprokop in that issue ticket.
2019-02-05 13:30:13 +01:00
maamountki
61526480f9 [ZARCH] Fix copy constraint 2019-02-05 07:51:19 +02:00
maamountki
81daf6bc38 [ZARCH] Format source code, Fix constraints 2019-02-05 07:30:38 +02:00
maamountki
a38aa56e76 Merge pull request #1 from xianyi/develop
Update
2019-02-05 07:25:38 +02:00
Martin Kroeker
729e925174 Merge pull request #1996 from quickwritereader/develop
NBMAX=4096 for gemvn, added sgemvn 8x8 for future
2019-02-04 16:52:04 +01:00
Ubuntu
498ac98581 Note for unused kernels 2019-02-04 15:41:56 +00:00
Ubuntu
cd9ea45463 NBMAX=4096 for gemvn, added sgemvn 8x8 for future 2019-02-04 06:57:11 +00:00
Martin Kroeker
f9c5023e04 Merge pull request #1994 from quickwritereader/develop
sgemv cgemv pairs
2019-02-01 21:04:47 +01:00
Ubuntu
4abc375a91 sgemv cgemv pairs 2019-02-01 13:45:00 +00:00
Martin Kroeker
874df65491 Fix incorrect sgemv results for IBM z14
part of PR #1993 that was inadvertently misplaced into the toplevel directory
2019-02-01 12:58:59 +01:00
Martin Kroeker
1f4b61f572 Delete misplaced file sgemv_t_4.c
from #1993 , file should have gone into kernel/zarch
2019-02-01 12:57:01 +01:00
Martin Kroeker
282230c303 Merge pull request #1993 from martin-frbg/aarnes-zarch
Various fixes for the new Z14 target
2019-01-31 21:27:00 +01:00
Martin Kroeker
cce574c3e0 Improve the z14 SGEMVT kernel
from patch provided by aarnez in #991
2019-01-31 21:24:55 +01:00
Martin Kroeker
877023e1e1 Fix precision of zarch DSDOT
from patch provided by aarnez in #991
2019-01-31 21:22:26 +01:00
Martin Kroeker
265142edd5 Fix typo in the zarch min/max kernels
from patch provided by aarnez in #991
2019-01-31 21:21:40 +01:00
Martin Kroeker
885a3c4350 USE_TRMM on Z14
from patch provided by aarnez in #991
2019-01-31 21:18:09 +01:00
Martin Kroeker
4b512f84dd Add cache sizes for Z14
from patch provided by aarnez in #991
2019-01-31 21:16:44 +01:00
Martin Kroeker
72d3e7c9b4 Add FORCE Z14
from patch provided by aarnez in #991
2019-01-31 21:15:50 +01:00
Martin Kroeker
bdc73a49e0 Add parameters for Z14
from patch provided by aarnez in #991
2019-01-31 21:14:37 +01:00
Martin Kroeker
1249ee1fd0 Add Z14 target
from patch provided by aarnez in #991
2019-01-31 21:13:46 +01:00
Martin Kroeker
42df9efa0c Merge pull request #1991 from maamountki/z14
[ZARCH] Z14 Support, BLAS 1/2 single precision implementations
2019-01-31 19:10:03 +01:00
maamountki
82124729af Merge branch 'develop' into z14 2019-01-31 19:36:41 +02:00
maamountki
29416cb5a3 [ZARCH] Add Z13 version for max/min functions 2019-01-31 19:11:11 +02:00
maamountki
48b9b94f7f [ZARCH] Improve loading performance for camax/icamax 2019-01-31 18:52:11 +02:00
Martin Kroeker
86a824c97f Fix wrong comparison that made IMIN identical to IMAX
as reported by aarnez in #1990
2019-01-31 15:27:21 +01:00
Martin Kroeker
808410c2c7 Fix wrong comparison that made IMIN identical to IMAX
as suggested in #1990
2019-01-31 15:25:15 +01:00
maamountki
eaf20f0e7a Remove ztest 2019-01-31 09:26:50 +02:00
maamountki
fcd814a8d2 [ZARCH] Fix bug in max/min functions 2019-01-29 17:59:38 +02:00
maamountki
dc4d3bccd5 [ZARCH] Fix icamax/icamin 2019-01-29 03:47:49 +02:00
maamountki
c7143c1019 [ZARCH] Fix iamax/imax single precision 2019-01-28 17:52:23 +02:00
maamountki
04873bb174 [ZARCH] Undo the last commit 2019-01-28 17:32:24 +02:00
maamountki
c8ef9fb220 [ZARCH] Fix bug in iamax/iamin/imax/imin 2019-01-28 17:16:18 +02:00
Martin Kroeker
5be61f4b47 Merge pull request #1985 from martin-frbg/issue1984
Correct naming of getrf_parallel object
2019-01-28 15:44:57 +01:00
Martin Kroeker
3d155cff83 Merge pull request #1981 from edisongustavo/develop
Fix include directory of exported targets
2019-01-28 15:44:42 +01:00
Martin Kroeker
7d47f0a82d Merge pull request #1978 from danielgindi/feature/msvc_cmake
Better support for MSVC/Windows in CMake (v0.3.x)
2019-01-28 15:43:35 +01:00
Martin Kroeker
a529c71a74 Merge pull request #1962 from brada4/r
Modernize R benchmarks slightly
2019-01-28 15:42:57 +01:00
TiborGY
ea1716ce2a Update Makefile.rule
Revert generate to install, explain the nature of the affinity conflict
2019-01-27 17:22:26 +01:00
TiborGY
0f24b39ebf Reword/expand comments in Makefile.rule
Lots of small changes in the wording of the comments, plus an expansion of the NUM_THREADS and NO_AFFINITY sections.
2019-01-27 15:33:00 +01:00
Martin Kroeker
89b60dab8a Merge pull request #1987 from martin-frbg/issue1961
Change ARMV8 target with BINARY=32 to ARMV7 automatically
2019-01-26 22:25:29 +01:00
Martin Kroeker
58dd7e4501 Change ARMV8 target to ARMV7 for BINARY=32 2019-01-26 17:52:33 +01:00
Martin Kroeker
36b844af88 Change ARMV8 target to ARMV7 when BINARY32 is set
fixes #1961
2019-01-26 17:47:22 +01:00
Martin Kroeker
e882b239aa Correct naming of getrf_parallel object
fixes #1984
2019-01-26 00:45:45 +01:00
Martin Kroeker
3f7bb87a2a Merge pull request #1971 from martin-frbg/trsm-threshold
Shift transition to multithreading towards larger matrix sizes
2019-01-24 09:17:48 +01:00
Edison Gustavo Muenz
e908ac2a51 Fix include directory of exported targets 2019-01-23 15:09:13 +01:00
Martin Kroeker
8533aca964 Avoid penalizing tall skinny matrices 2019-01-23 10:03:00 +01:00
Martin Kroeker
16494cb7c4 Merge pull request #1980 from martin-frbg/issue1979
Report SkylakeX as Haswell if compiler does not support AVX512
2019-01-22 21:10:38 +01:00
Martin Kroeker
b56b34a75c Syntax fix 2019-01-22 18:55:43 +01:00
Martin Kroeker
21eda8b577 Report SkylakeX as Haswell if compiler does not support AVX512
... or make was invoked with NO_AVX512=1
2019-01-22 18:47:12 +01:00
Daniel Cohen Gindi
24288803b3 Adjust test script for correct deployment 2019-01-22 14:38:01 +02:00
Martin Kroeker
f0d834b824 Use VERSION_LESS for comparisons involving software version numbers 2019-01-22 12:32:24 +01:00
Daniel Cohen Gindi
63bbd7b0d7 Better support for MSVC/Windows in CMake 2019-01-21 17:47:47 +02:00
maamountki
b111829226 [ZARCH] Update max/min functions 2019-01-21 15:56:04 +02:00
Martin Kroeker
010d59bfee Merge pull request #1973 from martin-frbg/issue1464
Increase Zen SWITCH_RATIO to 16
2019-01-20 20:30:11 +01:00
Martin Kroeker
83b5c6b92d Fix compilation with NO_AVX=1 set
fixes #1974
2019-01-20 12:18:53 +01:00
Martin Kroeker
bbfdd6c0fe Increase Zen SWITCH_RATIO to 16
following GEMM benchmarks on Ryzen2700X. For #1464
2019-01-19 23:01:31 +01:00
Martin Kroeker
cda81cfae0 Shift transition to multithreading towards larger matrix sizes
See #1886 and JuliaRobotics issue 500. trsm benchmarks on Haswell and Zen showed that with these values performance is roughly doubled for matrix sizes between 8x8 and 14x14, and still 10 to 20 percent better near the new cutoff at 32x32.
2019-01-19 00:10:01 +01:00
Martin Kroeker
32b0f1168e Fix declaration of input arguments in the Sandybridge GER microkernels (#1967)
* Tag arguments 0 and 1 as both input and output
2019-01-18 08:11:39 +01:00
Martin Kroeker
b495e54310 Fix declaration of input arguments in the x86_64 SCAL microkernels (#1966)
* Tag arguments 0 and 1 as both input and output (see #1964)
2019-01-18 08:11:07 +01:00
Martin Kroeker
d5e6940253 Fix declaration of input arguments in the x86_64 microkernels for DOT and AXPY (#1965)
* Tag operands 0 and 1 as both input and output

For #1964 (basically a continuation of coding problems first seen in #1292)
2019-01-17 23:20:32 +01:00
Martin Kroeker
24e697eadb Merge pull request #1970 from quickwritereader/develop
crot fix
2019-01-17 16:42:11 +01:00
Martin Kroeker
3e9fd6359d Bump xcode version to 10.1 to make sure it handles AVX512 2019-01-17 16:19:03 +01:00
Ubuntu
43a4572038 crot fix 2019-01-17 14:45:31 +00:00
Martin Kroeker
256eb588bb Merge pull request #1963 from quickwritereader/develop
Blas1 single missing kernels implemented with vector builtins
2019-01-16 18:41:03 +01:00
Abdelrauf
a034e65512 Merge branch 'develop' into develop 2019-01-16 19:25:13 +04:00
Ubuntu
8c3386be87 Added missing Blas1 single fp {saxpy, caxpy, cdot, crot(refactored version of srot),isamax ,isamin, icamax, icamin},
Fixed idamin,icamin choosing the first occurrence index of equal minima
2019-01-16 15:16:21 +00:00
Andrew
3e601bd419 disable NaN checks before BLAS calls dgemm.R 2019-01-16 11:54:22 +02:00
Andrew
478d3c4569 disable NaN checks before BLAS calls deig.R (shorten matrix def) 2019-01-16 11:41:46 +02:00
Andrew
3afceb6c2a disable NaN checks before BLAS calls deig.R 2019-01-16 11:38:14 +02:00
Andrew
7af8b21dbb disable NaN checks before BLAS calls dsolve.R (shorter formula) 2019-01-16 11:34:46 +02:00
Martin Kroeker
1e3ada6db4 Merge pull request #1960 from cnjsdfcy/Hygon
Add support for Hygon Dhyana
2019-01-16 10:27:14 +01:00
Andrew
2777a7f506 disable NaN checks before BLAS calls dsolve.R (shorter config part) 2019-01-16 11:23:51 +02:00
Andrew
b70fd23836 disable NaN checks before BLAS calls dsolve.R 2019-01-16 11:18:54 +02:00
Andrew
def0385caa init 2019-01-16 09:51:29 +02:00
caiyu
29dc72889f Add support for Hygon Dhyana 2019-01-16 14:25:19 +08:00
maamountki
b815a04c87 [ZARCH] fix a bug in max/min functions 2019-01-15 21:04:22 +02:00
Martin Kroeker
dbc9a060ef Fix missing braces in support_avx() call 2019-01-14 22:41:31 +01:00
Martin Kroeker
00401489c2 Fix missing braces in support_avx() 2019-01-14 22:38:32 +01:00
maamountki
1a7925b3a3 [ZARCH] Update dgemv_n_4.c 2019-01-11 17:43:11 +02:00
maamountki
406f835f00 [ZARCH] update cgemv_n_4.c 2019-01-11 17:39:17 +02:00
maamountki
621dedb37b [ZARCH] Update cgemv_t_4.c 2019-01-11 17:37:11 +02:00
maamountki
b731e8246f Update sgemv_t_4.c 2019-01-11 17:14:04 +02:00
maamountki
ecc31b743f Update dgemv_t_4.c 2019-01-11 17:13:02 +02:00
maamountki
5d89d6b143 [ZARCH] fix sgemv_n_4.c 2019-01-11 17:08:24 +02:00
maamountki
67432b23c2 [ZARCH] fix cgemv_n_4.c 2019-01-11 16:44:46 +02:00
Martin Kroeker
21c0f2af7b Merge pull request #1957 from martin-frbg/issue1954
Move TLS key deletion to openblas_quit
2019-01-10 12:04:08 +01:00
Martin Kroeker
ad2c386d6a Move TLS key deletion to openblas_quit
fixes #1954 (as suggested by thrasibule in that issue)
2019-01-10 00:32:50 +01:00
maamountki
be66f5d5c2 [ZARCH] fix data prefetch type in sdot 2019-01-09 16:50:07 +02:00
maamountki
c2ffef8156 [ZARCH] fix data prefetch type in ddot 2019-01-09 16:49:44 +02:00
maamountki
e7455f500c [ZARCH] fix dsdot.c 2019-01-09 16:33:54 +02:00
maamountki
3eafcfa650 [ZARCH] fix cgemv_n_4.c 2019-01-09 07:43:45 +02:00
Martin Kroeker
8d99dba86b Merge pull request #1949 from martin-frbg/issue1947
Query AVX2 and AVX512VL support when selecting x86 kernels
2019-01-08 20:44:08 +01:00
Martin Kroeker
1650311246 Bump xcode to 8.3 2019-01-08 14:43:45 +01:00
Martin Kroeker
cf5d48e833 Update OSX environment to Sierra
as homebrew seems to have dropped support for El Capitan in their gcc packages
2019-01-08 14:41:48 +01:00
Martin Kroeker
191677b902 Add travis_wait to the OSX brew install phase 2019-01-08 10:46:47 +01:00
Martin Kroeker
31ed19e8b9 Add message for SkylakeX and KNL fallbacks to Haswell 2019-01-05 19:41:13 +01:00
Martin Kroeker
e1574fa2b4 Add xcr0 (os support) check 2019-01-05 18:08:02 +01:00
Martin Kroeker
68eb3146ce Add xcr0 (os support) check 2019-01-05 18:07:14 +01:00
Martin Kroeker
0afaae4b23 Query AVX2 and AVX512VL capability in x86 cpu detection 2019-01-05 16:58:56 +01:00
Martin Kroeker
ae1d1f74f7 Query AVX2 and AVX512 capability for runtime cpu selection 2019-01-05 16:55:33 +01:00
maamountki
94cd946b96 [ZARCH] fix cgemv_n_4.c 2019-01-04 17:45:56 +02:00
Martin Kroeker
ed01f4932a Merge pull request #1946 from martin-frbg/issue1908
More fixes for cross-compiling ARM64 targets
2019-01-04 01:37:37 +01:00
maamountki
1aa840a0a2 [ZARCH] fix sgemv_t_4.c 2019-01-04 01:38:18 +02:00
Martin Kroeker
802f0dbde1 More fixes for cross-compiling ARM64 targets
Fixed core naming for DYNAMIC_ARCH. Corrected GEMM_DEFAULT entries and added SYMV_P. Replaced outdated VULCAN define for ThunderX2T99 with ARMV8 to get basic definitions back. For issue #1908
2019-01-03 22:17:31 +01:00
Martin Kroeker
20d1aad13f Fix missing quotes around thunderx targets 2019-01-02 20:15:35 +01:00
TiborGY
d11554c88f Validate user supplied TARGET (#1941)
the build will now abort with an error message when an undefined build TARGET is named

Fixes #1938
2018-12-31 23:19:44 +01:00
Martin Kroeker
ed704185ab Increment version to 0.3.6.dev 2018-12-31 23:11:37 +01:00
Martin Kroeker
2940798ea7 Increment version to 0.3.6.dev 2018-12-31 23:10:59 +01:00
Martin Kroeker
eebc189287 Version 0.3.5 2018-12-31 23:09:59 +01:00
Martin Kroeker
9185d419d3 Version 0.3.5 2018-12-31 23:09:20 +01:00
Martin Kroeker
4cf9d32694 Merge pull request #1945 from xianyi/develop
Merge changes from develop for 0.3.5 release
2018-12-31 23:08:25 +01:00
Martin Kroeker
1c75b65d53 Merge branch 'release-0.3.0' into develop 2018-12-31 23:07:53 +01:00
Martin Kroeker
13d006339b Update ChangeLog.txt with changes from 0.3.5 2018-12-31 23:00:46 +01:00
Martin Kroeker
bf76162635 Merge pull request #1944 from hartzell/patch-1
Typo: Skyalke -> Skylake
2018-12-31 18:36:18 +01:00
George Hartzell
0d52aefc6b Typo: Skyalke -> Skylake
Worth fixing, it gets in the way of searching....
2018-12-30 14:55:34 -08:00
Martin Kroeker
a6787b0f81 Merge pull request #1939 from TiborGY/patch-2
Fix typo in UNKNOWN core name
2018-12-30 20:10:05 +01:00
Martin Kroeker
8643521127 Merge pull request #1943 from martin-frbg/issue1748
Re-enable loop unrolling in trmv and remove the scary warning
2018-12-30 20:07:01 +01:00
Martin Kroeker
5a720cf9ca Re-enable loop unrolling in trmv and remove the scary warning
fixes #1748 as that half of the fix for #1332 appears to have been an overreaction on my part.
2018-12-30 15:22:37 +01:00
Martin Kroeker
ccd5945d38 Merge pull request #1942 from martin-frbg/issue1720
Delete the pthread key on cleanup in TLS mode
2018-12-30 14:47:05 +01:00
Martin Kroeker
9f80e0f5fc Remove stray include of complex.h
already provided conditionally by common.h via openblas_utest.h
Unconditional inclusion breaks older Android and similar platforms that use OPENBLAS_COMPLEX_STRUCT
2018-12-30 14:39:18 +01:00
Martin Kroeker
bba1e67269 Delete the pthread key on cleanup in TLS mode
to avoid a crash when OpenBLAS was loaded via dlopen and libc tries to clean up the leaked TLS after dlclose
Fixes #1720
2018-12-29 21:59:31 +01:00
Martin Kroeker
93240f489e Fix wrong case in TARGET setting for Alpine 2018-12-29 18:12:54 +01:00
TiborGY
7cbc2c37d6 Update cpuid_mips64.c 2018-12-28 14:36:39 +01:00
TiborGY
c329de2931 Update Makefile 2018-12-28 14:35:41 +01:00
TiborGY
187233953c Update cpuid_mips.c 2018-12-28 14:34:38 +01:00
TiborGY
09170268a3 Update cpuid_arm.c 2018-12-28 14:33:18 +01:00
TiborGY
211120c508 Fix typo in UNKNOWN core name
Should be of no consequence, right?
2018-12-27 23:09:21 +01:00
Martin Kroeker
9e4d190f4f Merge pull request #1932 from martin-frbg/issue1915
Add -fPIC to provided CFLAGS/FFLAGS if required
2018-12-24 23:48:33 +01:00
Martin Kroeker
fe02ba86a4 Remove unnecessary change again 2018-12-24 20:46:04 +01:00
Martin Kroeker
284fb00971 Merge pull request #1934 from fenrus75/betagoof
Fix thinko in skylake beta handling
2018-12-24 19:53:50 +01:00
Arjan van de Ven
795285c587 Fix thinko in skylake beta handling
casting ints is cheaper but it has a rounding, not memory casting effect, resulting in
invalid outcome
2018-12-24 18:49:50 +00:00
Martin Kroeker
d6818777d1 Make sure that -fPIC is present if needed 2018-12-23 23:47:37 +01:00
Martin Kroeker
5bd21ab6e1 Make sure that -fPIC is present when needed
override user-provided FFLAGS if necessary
2018-12-23 23:46:48 +01:00
Martin Kroeker
e1eab96502 Merge pull request #1931 from martin-frbg/pr1921
Add -mavx2 to TARGET=HASWELL builds
2018-12-23 23:15:54 +01:00
Martin Kroeker
76b4b8980f Use -dumpversion with gcc only 2018-12-23 19:08:19 +01:00
Martin Kroeker
49e0f485da Add -mavx2 for TARGET=HASWELL if compiler supports and requires it 2018-12-23 17:26:09 +01:00
Martin Kroeker
43c2b0eb55 Add -mavx2 to TARGET=HASWELL builds
to leverage improvements from PR#1921
2018-12-23 17:16:43 +01:00
Martin Kroeker
942e229ed5 Merge pull request #1930 from martin-frbg/issue1908
Reflect ARMV8 target definition changes from PR1876
2018-12-23 15:06:33 +01:00
Martin Kroeker
26a3402773 Reflect ARMV8 target definition changes from PR1876
and create config target directory for cross-compiles.
2018-12-23 12:26:01 +01:00
Martin Kroeker
20033f992a Merge pull request #1929 from martin-frbg/issue1924
Avoid taking the root of a negative number in simple threaded syrk
2018-12-23 09:03:58 +01:00
Martin Kroeker
f343ed65b5 Avoid taking the root of a negative number
Fixes #1924 where numpy 1.17+ would report the (transient) FE_INVALID exception raised for the domain error.
2018-12-22 22:30:29 +01:00
Martin Kroeker
a5a1118527 Merge pull request #1 from xianyi/develop
rebase
2018-12-22 22:13:44 +01:00
Martin Kroeker
e23366e860 Merge pull request #1921 from fenrus75/haswelldgemm
Replicate some of the SKYLAKEX dgemm improvements also to HASWELL
2018-12-17 08:39:20 +01:00
Arjan van de Ven
b28f75cd7e set GEMM_PREFERED_SIZE for HASWELL
Haswell likes a GEMM_PREFERED_SIZE of 16 to improve the split that the
threading code does to make it a nice multiple of the SIMD kernel size
2018-12-16 23:09:27 +00:00
Arjan van de Ven
d321448a63 dgemm: use dgemm_ncopy_8_skylakex.c also for Haswell
The dgemm_ncopy_8_skylakex.c code is not avx512 specific and gives
a nice performance boost for medium sized matrices
2018-12-16 23:09:22 +00:00
Arjan van de Ven
c43331ad0a dgemm: Use the skylakex beta function also for haswell
it's more efficient for certain tall/skinny matrices
2018-12-16 23:09:17 +00:00
Martin Kroeker
e8ca5a59a9 Merge pull request #1919 from fenrus75/haswelltuning
(sgemm) Apply some of the SKYLAKEX optimizations also to HASWELL
2018-12-16 20:11:05 +01:00
Martin Kroeker
c4e23dd016 Update Makefile 2018-12-16 18:14:40 +01:00
Martin Kroeker
cfc4acc221 typo 2018-12-16 16:19:51 +01:00
Martin Kroeker
545c2b1bbb Add -mavx2 on Haswell only if the compiler supports it 2018-12-16 13:09:19 +01:00
Arjan van de Ven
69d206440a Make the skylakex/haswell sgemm code compile and run even with compilers without avx2 support 2018-12-16 00:19:41 +00:00
Martin Kroeker
3843e3e017 use -mavx2 on haswell 2018-12-15 23:30:31 +01:00
Martin Kroeker
fbcb14a74b should be core-avx2 2018-12-15 20:18:59 +01:00
Martin Kroeker
2a3190dc76 fix elseifeq and use older option core2-avx for compatibility 2018-12-15 20:17:44 +01:00
Martin Kroeker
1ebe5c0f49 Add -march=haswell to HASWELL part of DYNAMIC_ARCH build 2018-12-15 19:35:35 +01:00
Arjan van de Ven
0586899a10 Use sgemm_ncopy_4_skylakex.c also for Haswell
sgemm_ncopy_4_skylakex.c uses SSE transpose operations where the
real perf win happens; this also works great for Haswell.

This gives double digit percentage gains on small and skinny matrices
2018-12-15 13:49:19 +00:00
Arjan van de Ven
00dc09ad19 Use the skylake sgemm beta code also for haswell
with a few small changes it's possible to use the skylake sgemm code
also for haswell, this gives a modest gain (10% range) for smallish
matrixes but does wonders for very skinny matrixes
2018-12-15 13:49:13 +00:00
Martin Kroeker
78d877b54b Merge pull request #1914 from fenrus75/smallmatrix
Add a "sgemm direct" mode for small matrixes
2018-12-13 19:08:14 +01:00
Arjan van de Ven
cdc668d82b Add a "sgemm direct" mode for small matrixes
OpenBLAS has a fancy algorithm for copying the input data while laying
it out in a more CPU friendly memory layout.

This is great for large matrixes; the cost of the copy is easily
amortized by the gains from the better memory layout.

But for small matrixes (on CPUs that can do efficient unaligned loads) this
copy can be a net loss.

This patch adds (for SKYLAKEX initially) a "sgemm direct" mode, that bypasses
the whole copy machinery for ALPHA=1/BETA=0/... standard arguments,
for small matrixes only.

What is small? For the non-threaded case this has been measured to be
in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's
less, around M*N*K = 1 * 512 * 512
2018-12-13 13:47:31 +00:00
Martin Kroeker
87718807f0 Merge pull request #1910 from martin-frbg/issue1909
Fix for DYNAMIC_ARCH builds made on a AVX512-capable host
2018-12-12 14:56:25 +01:00
Martin Kroeker
51aec8e96b make sure the added march=skylake-avx512 does not cause problems on Windows 2018-12-11 22:47:32 +01:00
Martin Kroeker
06f7d78d70 Add -march=skylake-avx512 to SkylakeX part of DYNAMIC_ARCH builds 2018-12-11 21:10:38 +01:00
Martin Kroeker
38cc638591 Avoid adding blanket march=skylake-avx512 to dynamic_arch builds 2018-12-11 21:09:26 +01:00
Martin Kroeker
0bf6d74e5f Fix typo in previous commit for arm dynamic arch 2018-12-07 19:37:33 +01:00
Martin Kroeker
133c278ee5 Add DYNAMIC_CORE list for ARM64
cf #1908
2018-12-07 17:42:23 +01:00
Martin Kroeker
2b355592e3 Make sure to use the arm version of dynamic.c in ARM64 DYNAMIC_ARCH
cf. #1908
2018-12-07 16:25:55 +01:00
Martin Kroeker
ff3eb1d474 Merge pull request #1904 from martin-frbg/issue1870
Fix cmake parsing of GEMM kernels for ARMV8
2018-12-06 23:01:23 +01:00
Martin Kroeker
0b09516678 Fix missing parameter in popen call 2018-12-06 18:33:05 +01:00
Martin Kroeker
7639f2e1f0 Rewrite the conditional for OSX to fix cmake parsing on others
The Makefile variable parser in utils.cmake currently does not handle conditionals. Having the definitions for non-OSX last will at least make cmake builds work again on non-OSX platforms.
2018-12-06 14:04:27 +01:00
Martin Kroeker
2fc712469d Avoid creating spurious non-suffixed c/zgemm_kernels
Plain cgemm_kernel and zgemm_kernel are not used anywhere, only cgemm_kernel_b etc.
Needlessly building them (without any define like NN, CN, etc.) just happened to work on most platforms, but not on arm64. See #1870
2018-12-06 13:56:06 +01:00
Martin Kroeker
6ba30e270d Fix typo that broke CNRM2 on ARMV8 since 0.3.0
must have happened in my #1449
2018-12-06 13:42:25 +01:00
Martin Kroeker
bf23518e36 Merge pull request #1903 from rengolin/armv8
Fix two mistakes on Arm64 builds
2018-12-05 22:10:53 +01:00
Renato Golin
31a490ea88 Fix two mistakes on Arm64 builds
* Falkor is an ARMv8.0 with ARMv8.1 features, and choosing armv8.1-a for
   march generates instructions it cannot cope with. Reverting it back
   to armv8-a.
 * ThunderX2's build was left with a #define VULCAN, which made it miss
   the right compiler flags in Makefile.arm64, although it did create
   the right library in the end.
2018-12-05 18:51:38 +00:00
Martin Kroeker
701ea88347 Use p2align instead of align for OSX compatibility
fixes #1902
2018-12-03 13:06:43 +01:00
Martin Kroeker
721c56c224 Merge pull request #1899 from brada4/fbsd12
Add mutually supported architecture mappings for FreeBSD12 ports
2018-12-03 12:50:27 +01:00
Martin Kroeker
c5f8aeff2d Merge branch 'develop' into fbsd12 2018-12-03 12:50:14 +01:00
Martin Kroeker
8278cbe7f8 Merge pull request #1894 from pkubaj/patch-2
Use correct ARCH name on BSD powerpc64
2018-12-03 12:48:53 +01:00
Martin Kroeker
ea6d1b96bd Update Makefile.system 2018-12-03 08:59:10 +01:00
Martin Kroeker
360374be62 Update with the changes from 0.3.4 2018-12-02 23:44:13 +01:00
Martin Kroeker
f5acaad8f0 Increment version to 0.3.5.dev 2018-12-02 23:43:15 +01:00
Martin Kroeker
93fa6b7b76 Increment version to 0.3.5.dev 2018-12-02 23:42:33 +01:00
Martin Kroeker
c0827a7164 Update with changes from 0.3.4 2018-12-02 23:41:17 +01:00
Martin Kroeker
86cff4effc Merge pull request #1900 from xianyi/develop
Update from develop for 0.3.4
2018-12-02 23:40:21 +01:00
Martin Kroeker
b028960aba Merge branch 'release-0.3.0' into develop 2018-12-02 23:38:49 +01:00
Martin Kroeker
3c9e3faedb fixup BSD naming of powerpc arch 2018-12-02 23:24:53 +01:00
Andrew
44c81fd135 oops 2018-12-02 20:27:53 +01:00
Andrew
26b3710485 Add architecture mappings for FreeBSD12 2018-12-02 12:07:41 +01:00
Andrew
84e614d0fd init 2018-12-02 12:05:15 +01:00
Martin Kroeker
dceff5542c Handle Android environments that identify as Linux (#1898)
* Handle Android environments that identify as Linux

termux terminal emulator does this, causing build failures through missed defines in common.h
2018-12-01 20:56:11 +01:00
Martin Kroeker
6c7b691083 Really revert xDOT changes from 1832
neglected to rebase #1892 on merging
2018-11-30 21:32:01 +01:00
Martin Kroeker
5f4c550c27 Merge pull request #1892 from martin-frbg/mipsdot
revert MIPS64 xDOT kernel changes from #1832
2018-11-30 21:28:21 +01:00
pkubaj
731b2722ba Fix build on POWER, remove DragonFly, add NetBSD
__asm is complete on its own

DBSD developers state they will only support amd64, but NetBSD supports POWER.
2018-11-30 21:12:05 +01:00
pkubaj
f85ce54d4a Use correct Makefile on powerpc64
FreeBSD uses powerpc64 name for POWER architecture. Use correct Makefile for this platform.
2018-11-30 16:05:49 +00:00
Andrew
2601cd58ab remove surplus locking code , only enabled w x86, disabled or never enabled on all others 2018-11-30 11:38:19 +01:00
Martin Kroeker
95a5542e3c Revert DOT kernel changes from #1834
as the failures seen on Loongson3A appear to be limited to DSDOT/SDSDOT (i.e. my hackish "fix" from #1684)
2018-11-30 11:16:24 +01:00
Martin Kroeker
7a2e1bc804 Use generic kernel for DSDOT/SDSDOT
as discussed in #1834
2018-11-30 10:57:09 +01:00
Martin Kroeker
35653e38b3 Merge pull request #1834 from fengrl/develop
register push/pop command change
2018-11-30 10:48:46 +01:00
Martin Kroeker
71e25ae42f Merge pull request #1890 from martin-frbg/issue1889
Include version number in openblas_get_config output
2018-11-29 15:47:35 +01:00
Martin Kroeker
97d7298973 call it OpenBLAS not just version 2018-11-29 11:52:08 +01:00
Martin Kroeker
de0d0ed52f Improve formatting of config output 2018-11-29 11:28:19 +01:00
Martin Kroeker
081ceb3e02 Propagate version number for openblas_get_config 2018-11-29 00:12:04 +01:00
Martin Kroeker
a29ec458c2 propagate version number for openblas_config_version 2018-11-29 00:10:49 +01:00
Martin Kroeker
816775e309 Add version information to openblas_get_config output 2018-11-29 00:06:44 +01:00
Martin Kroeker
b6363f4539 Merge pull request #1885 from brada4/freebsd
Fix freebsd clang compilation of skylakex
2018-11-25 22:20:13 +01:00
Andrew
19c4bdd8b3 Add return value so that freebsd system clang does not err out 2018-11-25 21:35:01 +01:00
Andrew
f049a4c84f init 2018-11-25 21:34:09 +01:00
Martin Kroeker
f72fdf525c Merge pull request #1875 from martin-frbg/issue1851
Serialize accesses to parallelized level3 functions from multiple cal…
2018-11-25 20:53:46 +01:00
Martin Kroeker
5393759a98 Merge pull request #1869 from martin-frbg/axpy0
Handle special case INCX=0,INCY=0 in the axpy interface
2018-11-25 20:52:49 +01:00
Martin Kroeker
5cf18e2875 Merge pull request #1878 from kiwifb/PGI_f_check
Correct link flags for PGI compiler.
2018-11-25 20:51:50 +01:00
Martin Kroeker
910050985a Merge pull request #1876 from rengolin/armv8-cleanup
Simplifying ARMv8 build parameters
2018-11-25 20:51:24 +01:00
François Bissey
0184713e1a Correct link flags for PGI compiler. 2018-11-21 14:24:56 +13:00
Martin Kroeker
45c3c459e1 Merge pull request #1868 from martin-frbg/aix_cpuid
Use prtconf to determine CPU type on AIX
2018-11-20 17:25:57 +01:00
Martin Kroeker
113cb00b95 fix missing parenthesis 2018-11-19 21:01:36 +01:00
Martin Kroeker
5192651706 Add CriticalSection handling instead of mutexes for Windows 2018-11-19 17:58:22 +01:00
Renato Golin
310ea55f29 Simplifying ARMv8 build parameters
ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode
(which is not right because TX2 is ARMv8.1) as well as requiring a few
redundancies in the defines, making it harder to maintain and understand
what core has what. A few other minor issues were also fixed.

Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX,
ThunderX2, and XGene.

Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester.

A summary:
 * Removed TX2 code from ARMv8 build, to make sure it is compatible with
   all ARMv8 cores, not just v8.1. Also, the TX2 code has actually
   harmed performance on big cores.
 * Commoned up ARMv8 architectures' defines in params.h, to make sure
   that all will benefit from ARMv8 settings, in addition to their own.
 * Adding a few more cores, using ARMv8's include strategy, to benefit
   from compiler optimisations using mtune. Also updated cache
   information from the manuals, making sure we set good conservative
   values by default. Removed Vulcan, as it's an alias to TX2.
 * Auto-detecting most of those cores, but also updating the forced
   compilation in getarch.c, to make sure the parameters are the same
   whether compiled natively or forced arch.

Benefits:
 * ARMv8 build is now guaranteed to work on all ARMv8 cores
 * Improved performance for ARMv8 builds on some cores (A72, Falkor,
   ThunderX1 and 2: up to 11%) over current develop
 * Improved performance for *all* cores comparing to develop branch
   before TX2's patch (9% ~ 36%)
 * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than
   current develop's branch and 8% faster than develop before tx2 patches

Issues:
 * Regression from current develop branch for A53 (-12%) and A57 (-3%)
   with ARMv8 builds, but still faster than before TX2's commit (+15%
   and +24% respectively). This can be improved with a simplification of
   TX2's code, to be done in future patches. At least the code is
   guaranteed to be ARMv8.0 now.

Comments:
 * CortexA57 builds are unchanged on A57 hardware from develop's branch,
   which makes sense, as it's untouched.
 * CortexA72 builds improve over A57 on A72 hardware, even if they're
   using the same includes due to new compiler tuning in the makefile.
2018-11-19 16:41:49 +00:00
Martin Kroeker
2e6fae2aad Serialize accesses to parallelized level3 functions from multiple callers
for #1851
2018-11-19 14:02:50 +01:00
Martin Kroeker
368d14f8c8 Fix harmless typo
fixes #1872
2018-11-16 14:58:28 +01:00
Martin Kroeker
42bc2a9202 Fix copy-paste errors (POWER8/9 and extraneous return) 2018-11-16 12:10:44 +01:00
fengruilin
43bb386b10 fix dot problem on 64bit mips 2018-11-15 11:11:59 +08:00
Martin Kroeker
c171b8ad13 Handle special case INCX=0,INCY=0 in the axpy interface 2018-11-13 13:57:18 +01:00
Martin Kroeker
2f04cf22ac Detect POWER9 as POWER8 on AIX and Linux
(already supported by the *BSD version)
2018-11-13 08:16:14 +01:00
Martin Kroeker
807f6e6922 Use prtconf to determine CPU type on AIX
for #1803
2018-11-12 18:52:29 +01:00
Martin Kroeker
ecbeb802a0 Merge pull request #1865 from martin-frbg/issue1844
Optimize gemv for small M, large N only if it can be done in a threadsafe manner
2018-11-12 17:30:44 +01:00
Martin Kroeker
2c5725cc39 Merge pull request #1864 from aytekinar/patch-1
Add ARM tests on Travis
2018-11-12 14:30:28 +01:00
Arda Aytekin
e3666931d8 Update .travis.yml
Updated `.travis.yml` file to add emulated tests for `ARMV6` and `ARMV8`
architectures with `gcc` and `clang`.  Created prebuilt images with
required dependencies. Squashed layers into one.
2018-11-11 20:50:38 +01:00
Martin Kroeker
ae02a57261 Merge pull request #1866 from martin-frbg/issue1859
Fix argument in SLASET call to zero S
2018-11-10 19:23:31 +01:00
Martin Kroeker
a6a52a73f7 Fix argument in SLASET call to zero S
fixes #1859 in accordance with https://github.com/LAPACK-Reference/issue/296
2018-11-10 17:16:53 +01:00
Martin Kroeker
0427277cef Allow optimization for small m, large n only if it can be made threadsafe
otherwise the introduction of a static array in 8e5a108 to improve #532 breaks concurrent calls from multiple threads as seen in #1844
2018-11-10 15:45:54 +01:00
Martin Kroeker
4f43668eec Merge pull request #2 from xianyi/develop
merge develop
2018-11-10 15:37:25 +01:00
Martin Kroeker
b0c15bacc1 Merge pull request #1863 from martin-frbg/aix_install3
Set LIBSONAME suffix to .a for AIX
2018-11-09 13:12:06 +01:00
Martin Kroeker
cfb0f5b0f8 Set LIBSONAME suffix to .a for AIX
another fix for #1803
2018-11-08 22:39:10 +01:00
Martin Kroeker
667fed579d Merge pull request #1856 from rengolin/armv8-a57
[Arm64] Revert A53 detection as A57
2018-11-07 21:01:29 +01:00
Martin Kroeker
96d2f2c9b2 Merge pull request #1831 from brada4/hemv
disable threading in C/ZSWAP copying from S/DSWAP
2018-11-07 08:49:21 +01:00
Martin Kroeker
653e657a58 Merge pull request #1857 from brada4/fc-1847
Add gfortran -frecursive option from upstream and #1847
2018-11-07 08:48:31 +01:00
Martin Kroeker
5f8f0583d4 Merge branch 'develop' into fc-1847 2018-11-07 08:47:52 +01:00
Martin Kroeker
974a6a30f2 Merge pull request #1858 from brada4/buff-1847
Add minimum threshold for number of buffers
2018-11-07 08:46:55 +01:00
Andrew
9531d0e175 lets fit it in one 4k page 2018-11-06 17:51:24 +00:00
Andrew
40cce0e353 handle cmake too 2018-11-06 09:45:49 +00:00
Andrew
3fd41313fc add low bound for number of buffers 2018-11-06 09:40:13 +00:00
Andrew
a931afe269 init 2018-11-06 09:39:05 +00:00
Andrew
7d3502b500 Add -frecursive gfortran option by default 2018-11-06 08:20:55 +00:00
Andrew
066f8065d1 init 2018-11-06 08:19:08 +00:00
Renato Golin
fb5b2177ca [Arm64] Revert A53 detection as A57
This patch reverts the decision of treating A53 like A57, which was
based on an analysis done on server class hardware and is not
representative of all A53s out there.

Fixes #1855.
2018-11-05 11:34:49 +00:00
Martin Kroeker
f1c02273cb Merge pull request #1846 from fenrus75/threadsize
gemm/dgemm: add a way for an arch kernel to specify preferred sizes
2018-11-02 13:18:01 +01:00
Martin Kroeker
661035477c Merge pull request #1850 from martin-frbg/issue1811
Restore Android/ARMv7 build fix from #778
2018-11-02 09:50:51 +01:00
Martin Kroeker
aa7e47aa0a Merge pull request #1849 from martin-frbg/aix_install2
Use installbsd on AIX
2018-11-01 20:39:16 +01:00
Martin Kroeker
9c177d270b Restore Android/ARMv7 build fix from #778
for #1811
2018-11-01 18:50:25 +01:00
Martin Kroeker
b025523197 Use installbsd on AIX
(and fix misplaced parenthesis from previous commit). See #1803
2018-11-01 18:26:08 +01:00
Martin Kroeker
5b50bd36f7 Merge pull request #1845 from martin-frbg/aix_install
Accommodate AIX install, which has different syntax
2018-11-01 09:53:10 +01:00
Arjan van de Ven
5b708e5eb1 sgemm/dgemm: add a way for an arch kernel to specify preferred sizes
The current gemm threading code can make very unfortunate choices, for
example on my 10 core system a 1024x1024x1024 matrix multiply ends up
chunking into blocks of 102... which is not a vector friendly size
and performance ends up horrible.

this patch adds a helper define where an architecture can specify
a preference for size multiples.
This is different from existing defines that are minimum sizes and such.

The performance increase with this patch for the 1024x1024x1024 sgemm
is 2.3x (!!)
2018-11-01 01:43:20 +00:00
Arjan van de Ven
dcc5d6291e skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case
in the threading code there are cases where N or M can become 0,
and the optimized beta code did not handle this well, leading
to a crash

during the audit for the crash a few edge conditions on the if statements
were found and fixed as well
2018-11-01 01:42:09 +00:00
Martin Kroeker
7b5aea52bb Accommodate AIX install, which has different syntax
for #1803
2018-10-31 21:50:34 +01:00
Martin Kroeker
f5595d0262 Merge pull request #1843 from martin-frbg/aix_numprocs
Add get_num_procs implementation for AIX
2018-10-31 21:25:15 +01:00
Martin Kroeker
326d394a0f Add get_num_procs implementation for AIX
(and copy HAIKU implementation to the non-TLS version of the code as well)
2018-10-31 18:38:22 +01:00
Martin Kroeker
6af8e35a24 Merge pull request #1837 from embray/set-num-thread-after-fork
Ensure that blas_thread_init has been called in openblas_set_num_threads
2018-10-30 12:41:24 +01:00
Erik M. Bray
38cf5d9364 ensure that threading has been initialized in the first place before calling openblas_set_num_threads 2018-10-28 21:16:52 +00:00
Martin Kroeker
8a43baacb2 Merge pull request #1836 from martin-frbg/zen2core
Fix detection of Ryzen2 (missing CORE_ZEN)
2018-10-28 20:00:01 +01:00
Martin Kroeker
64ca44873b Fix detection of Ryzen2 (missing CORE_ZEN) 2018-10-28 18:36:55 +01:00
fengrl
2d8064174c register push/pop command change
64bit push/pop register command should be used. Otherwise, data will be lost.
2018-10-26 17:55:15 +08:00
Martin Kroeker
76a66eaac8 Merge pull request #1829 from ashwinyes/develop_aarch64_dynamic_arch_support
Add DYNAMIC_ARCH support for ARM64
2018-10-23 18:14:28 +02:00
Andrew
2992e3886a disable threading in C/ZSWAP copying from S/DSWAP 2018-10-22 23:21:49 +03:00
Ashwin Sekhar T K
d5aeff636f ARM64: Enable DYNAMIC_ARCH
Enable DYNAMIC_ARCH feature on ARM64. This patch uses the cpuid
feature in linux kernel to detect the core type at runtime
(https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt).

If this feature is missing in kernel, then the user should use the
OPENBLAS_CORETYPE env variable to select the desired core type.
2018-10-22 01:49:35 -07:00
Ashwin Sekhar T K
af2837c392 ARM64: Remove #define ARMV8 for THUNDERX 2018-10-22 01:49:35 -07:00
Ashwin Sekhar T K
e7b66cd36e ARM64: Fix DYNAMIC_ARCH compilation for cores which dont use GEMM3M 2018-10-22 01:45:51 -07:00
Ashwin Sekhar T K
d50abc8903 ARM64: Move parameters from parameter.c to param.h
Remove the runtime setting of P, Q, R parameters for
targets ARMV8, THUNDERX2T99. Instead set them as constants
in param.h at compile time.
2018-10-22 01:45:51 -07:00
Ashwin Sekhar T K
351a0c777c ARM64: Remove XGENE1 references
Remove XGENE1 target as the implementation for the
same is incomplete. Moreover whoever wishes to use
on XGENE1 can use the generic ARMV8 target as there
are no XGENE1 specific optimizations in OpenBLAS.
2018-10-22 01:45:51 -07:00
Martin Kroeker
e3c262e5cf Merge pull request #1825 from brada4/hemv
Delay _hemv threading in attempt to address #1820
2018-10-21 20:34:05 +02:00
Andrew
a293bdcd5e re-arrange new code for readability 2018-10-20 21:37:53 +03:00
Andrew
c7bbf9c987 Attempt to tame _hemv threading #1820 2018-10-20 11:13:29 +03:00
Andrew
898a8dcaba init 2018-10-20 10:55:04 +03:00
Martin Kroeker
71c6deed60 Merge pull request #1821 from ashwinyes/develop_aarch64_armv8neonkernels
Use ThunderX2 Neon Kernels for ARMV8 Target
2018-10-18 08:13:05 +02:00
Ashwin Sekhar T K
21f46a1cf2 ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8
Currently the generic ARMV8 target uses C implementations
for many routines. Replace these with the neon implementations
written for THUNDERX2T99 target which are up to 6x faster for
certain routines.
2018-10-17 10:44:37 -07:00
Ashwin Sekhar T K
caf339412f ARM64: Remove dependency of THUNDERX2T99 Makefile on CORTEXA57 Makefile 2018-10-17 08:02:40 -07:00
Ashwin Sekhar T K
8001fdcd2a ARM64: Remove dependency of THUNDERX Makefile on ARMV8 Makefile 2018-10-17 08:02:16 -07:00
Ashwin Sekhar T K
162e312832 ARM64: Remove dependency of CORTEXA57 Makefile on ARMV8 Makefile 2018-10-17 08:01:45 -07:00
Ashwin Sekhar T K
c3d93caa8d ARM64: Remove dependency of XGENE1 Makefile on ARMV8 Makefile 2018-10-17 08:01:27 -07:00
Martin Kroeker
a71923514f Merge pull request #1815 from fenrus75/sgemm_beta_fix
enable the SGEMM/SKX C based kernel
2018-10-14 19:57:34 +02:00
Arjan van de Ven
55b244ca0d enable the SGEMM/SKX C based kernel
In QA the final bug was found so now the skylakex sgemm C based kernel can
be activated....
2018-10-12 09:30:35 +00:00
Martin Kroeker
2263d3906c Merge pull request #1812 from martin-frbg/issue1806-2
Use KERNEL_DEFINITIONS rather than COMMON_OPTS to pass -march=skylake…
2018-10-11 21:51:31 +02:00
Martin Kroeker
81c9985c3a Use KERNEL_DEFINITIONS rather than COMMON_OPTS to pass -march=skylake-avx512 2018-10-11 11:03:27 +02:00
Martin Kroeker
56ebc7b53e Merge pull request #1808 from martin-frbg/issue1806
Add -march=skylake-avx512 to CFLAGS when the target is Skylake
2018-10-11 07:48:08 +02:00
Martin Kroeker
c5f88f5a57 Merge pull request #1807 from xianyi/revert-1798-cmake-avx512
Revert "Add -march=skylake-avx512 when required"
2018-10-11 07:47:53 +02:00
Martin Kroeker
8a11ec19d1 Syntax fix 2018-10-10 23:47:35 +02:00
Martin Kroeker
fa53b903db Add -march=skylake-avx512 to CFLAGS when the target is Skylake
Should fix 1806 and #1801
2018-10-10 19:22:01 +02:00
Martin Kroeker
84bcdf9c66 Revert "Add -march=skylake-avx512 when required" 2018-10-10 19:15:32 +02:00
Martin Kroeker
8f7e986184 Merge pull request #1802 from martin-frbg/issue1801
Use avx512 workaround with msys2/mingw64 as well
2018-10-10 08:52:53 +02:00
Martin Kroeker
d0e83666ad Merge pull request #1804 from fenrus75/sgemm
Add a C+intrinsics version of the SGEMM/skylakex kernel
2018-10-10 08:50:44 +02:00
Arjan van de Ven
d4bad73834 Add a C+intrinsics version of the SGEMM/skylakex kernel
for most sizes this is 1.2x to 1.4x faster than the current code
2018-10-10 01:49:22 +00:00
Martin Kroeker
065763adde Merge pull request #1800 from fengrl/patch-1
Update common_mips64.h for the 1st loop of blas_memory_alloc
2018-10-09 10:56:37 +02:00
Martin Kroeker
210b03b543 Merge pull request #1792 from martin-frbg/cmakesuffix
Improve CMake help output and add SYMBOLPREFIX and -SUFFIX options
2018-10-09 10:34:52 +02:00
Martin Kroeker
6234a32656 Use cygwin compilation workaround for avx512 on msys2/mingw64 as well 2018-10-09 10:31:59 +02:00
Martin Kroeker
c0d7cd3dac Merge pull request #1799 from martin-frbg/issue1796
Handle conflicting usage of ARCH in at least some BSD environments
2018-10-09 08:20:52 +02:00
Martin Kroeker
667f0cc1cb Merge pull request #1793 from fenrus75/ncopy
Add optimized *copy versions for skylakex
2018-10-09 08:19:14 +02:00
fengrl
d4c8853a02 Update common_mips64.h 2018-10-09 11:20:16 +08:00
Martin Kroeker
d3d58f8ee5 Catch conflicting usage of ARCH in at least some BSD environments
fixes #1796
2018-10-08 22:29:35 +02:00
Martin Kroeker
697dc1baf8 Use override for ARCH in make.inc
in case a conflicting setting of ARCH (for architecture) gets pulled in from the environment
(originally suggested by dloghin in #1753)
2018-10-08 22:26:59 +02:00
Martin Kroeker
a9b51b8448 Merge pull request #1798 from martin-frbg/cmake-avx512
Add -march=skylake-avx512 when required
2018-10-08 21:15:17 +02:00
Martin Kroeker
eba394c711 Add -march=skylake-avx512 when required
fixes #1797
2018-10-08 19:18:12 +02:00
Arjan van de Ven
582c589727 dgemm/skylakex: replace discrete mul/add with fma
very minor gains since it's not super hot code, but general principles
2018-10-06 23:13:26 +00:00
Arjan van de Ven
adbf6afa25 Add vector optimizations for ncopy as well for dgemm/skylakex 2018-10-06 21:18:12 +00:00
Arjan van de Ven
32bec8afbb add a skylakex optimized dgemm beta function 2018-10-06 16:36:26 +00:00
Martin Kroeker
6e2c494556 Merge pull request #1791 from dev-zero/develop
fix parallel build issues with APFS/HFS+/ext2/3 in netlib-lapack
2018-10-06 16:29:29 +02:00
Arjan van de Ven
20c5d668fe dgemm/avx512 simplify and speed up the 4x4 kernel 2018-10-06 14:12:32 +00:00
Arjan van de Ven
6d43c51ccf undo slow dgemm/skylake microoptimization
the compare is more costly than the work
2018-10-06 14:00:37 +00:00
Arjan van de Ven
d74dc39b0f Add optimized *copy versions for skylakex
Add optimized n/t copy versions for skylakex; in the patch the
tcopy is also rewritten using intrinsics; the ncopy file
will be worked on in a future commit
2018-10-06 13:51:44 +00:00
Martin Kroeker
41951da6d4 Merge pull request #6 from xianyi/develop
merge develop
2018-10-06 14:36:36 +02:00
Martin Kroeker
474f7e9583 Add SYMBOLPREFIX and -SUFFIX options and improve help output 2018-10-06 14:28:04 +02:00
Tiziano Müller
79ea839b63 fix parallel build issues with APFS/HFS+/ext2/3 in netlib-lapack
The problem is that OpenBLAS sets the LAPACKE_LIB and the TMGLIB to the
same object and uses the `ar` feature to update the archive file. If the
underlying filesystem does not have sub-second timestamp resolution and
the system is fast enough (or `ccache` is used), the timestamp of the
builds which should be added to the previously generated archive is the
same as the archive file itself and therefore `make` does not update the
archive.

Since OpenBLAS takes care to not run the different targets updating the
archive in parallel, the easiest solution is to declare the respective
targets `.PHONY`, forcing `make` to always update them.

fixes #1682
2018-10-06 14:10:05 +02:00
Martin Kroeker
f7f97c6148 Merge pull request #1789 from brada4/develop
update travis alpine chroot with avx512 intrinsics headers
2018-10-05 20:42:37 +02:00
Martin Kroeker
6f22e1cfb8 Merge pull request #1788 from fenrus75/avx512-8x16
skylake dgemm: Add a 16x8 kernel
2018-10-05 20:40:38 +02:00
Arjan van de Ven
66b43affbc Add a 24x8 kernel to the skylakex dgemm implementation
Minor gains for small matrixes, but at 512x512 and above the gain
gets more significant.
2018-10-05 13:22:21 +00:00
Arjan van de Ven
1938819c25 skylake dgemm: Add a 16x8 kernel
The next step for the avx512 dgemm code is adding a 16x8 kernel.
In the 8x8 kernel, each FMA has a matching load (the broadcast);
in the 16x8 kernel we can reuse this load for 2 FMAs, which
in turn reduces pressure on the load ports of the CPU and gives
a nice performance boost (in the 25% range).
2018-10-05 13:11:35 +00:00
Andrew
bda3dbe2eb update travis alpine chroot with avx512 intrinsics headers 2018-10-05 15:47:55 +03:00
Andrew
c3e0f0eb38 update travis alpine chroot with avx512 intrinsics headers 2018-10-05 15:41:52 +03:00
Martin Kroeker
a980953bd7 Merge pull request #1785 from brada4/develop
address #1782 2nd loop
2018-10-05 08:25:38 +02:00
Martin Kroeker
78c99d5231 Merge pull request #1784 from fenrus75/dgemm-avx512
Create a AVX512 enabled version of DGEMM
2018-10-05 08:03:27 +02:00
Martin Kroeker
b7496c3638 Function name needs to be CNAME, set from outside to allow suffixing for dynamic_arch 2018-10-04 19:14:59 +02:00
Martin Kroeker
95f4e87579 Merge pull request #1787 from jeromerobert/develop
Fix unknown type name __WAIT_STATUS on RHEL5
2018-10-04 18:41:47 +02:00
Jerome Robert
b095f2fad6 Fix unknown type name __WAIT_STATUS on RHEL5
With glibc 2.5 one must have #define _XOPEN_SOURCE >= 500 to use wait.
But reading glibc code this is actually needed only if stdlib.h was
included before sys/wait.h. This was the case here through
openblas_utest.h. So changing the include fixes compilation on RHEL5 and
should not hurt with more recent distros.

* Problem found when using with gcc 5.5 and 4.7.2 on RHEL5/CENTOS5
* Fix #1519
2018-10-04 14:37:08 +02:00
Martin Kroeker
02ef20a1e4 Merge pull request #1786 from martin-frbg/immintrin
Check for Immintrin.h presence in the AVX512 compatibility test as well
2018-10-04 09:07:09 +02:00
Martin Kroeker
4c3643ed7f Check availability of immintrin.h in the AVX512 compatibility test 2018-10-04 07:36:49 +02:00
Martin Kroeker
591cca7cb0 Check availability of immintrin.h in the AVX512 compatibility test 2018-10-04 07:35:30 +02:00
Andrew
3439158dea address #1782 2nd loop 2018-10-03 21:20:50 +02:00
Arjan van de Ven
45fe8cb0c5 Create a AVX512 enabled version of DGEMM
This patch adds dgemm_kernel_4x8_skylakex.c which is
* dgemm_kernel_4x8_haswell.s converted to C + intrinsics
* 8x8 support added
* 8x8 kernel implemented using AVX512

Performance is a work in progress, but already shows a 10% - 20%
increase for a wide range of matrix sizes.
2018-10-03 14:45:25 +00:00
Martin Kroeker
544b069e85 Merge pull request #1780 from martin-frbg/issue1774-2
Convert fldmia/fstmia instructions to UAL syntax for clang7
2018-09-29 09:27:47 +02:00
Martin Kroeker
9b2a7ad40d Convert fldmia/fstmia instructions to UAL syntax for clang7
second part of fix for #1774, containing files missed in #1775
2018-09-28 23:05:15 +02:00
Martin Kroeker
10ce70701a Merge pull request #1778 from fengrl/develop
test_axpy work error on LOONGSON3A platform #1777
2018-09-26 11:14:58 +02:00
fengruilin
6fc85a6359 test_axpy work error on LOONGSON3A platform #1777 2018-09-26 15:14:04 +08:00
Martin Kroeker
831c661386 Merge pull request #1775 from martin-frbg/issue1774
Convert fldmia/fstmia instructions to UAL syntax for clang7
2018-09-25 18:58:39 +02:00
Martin Kroeker
7e5df34e6a Convert fldmia/fstmia instructions to UAL syntax for clang7
fixes #1774
2018-09-25 09:41:58 +02:00
Martin Kroeker
4f45040b89 Merge pull request #1773 from martin-frbg/issue1767
Include thread numbers in failure message from blas_thread_init
2018-09-23 23:25:15 +02:00
Martin Kroeker
28aa94bf4b Include thread numbers in failure message from blas_thread_init
to aid in debugging cases like #1767
2018-09-22 14:00:15 +02:00
Martin Kroeker
56e7c68810 Merge pull request #1771 from staticfloat/sf/ldflags
Add `$(LDFLAGS)` to `$(CC)` and `$(FC)` invocations within `exports/Makefile`
2018-09-22 13:11:39 +02:00
Martin Kroeker
cf6df9464c Document the stub status of the QUAD_PRECISION code (#1772)
* Document the stub status of the QUAD_PRECISION code inherited from GotoBLAS2

in response to #1769
2018-09-22 12:31:37 +02:00
Elliot Saba
6f77af2eef Add $(LDFLAGS) to $(CC) and $(FC) invocations within exports/Makefile 2018-09-21 09:19:51 +00:00
Martin Kroeker
4d183e5567 Merge pull request #1765 from martin-frbg/issue1761
Do not use the new TLS-enabled memory allocator for non-threaded builds, and disable TLS by default in gmake as well
2018-09-19 22:02:21 +02:00
Martin Kroeker
34d55fd165 Merge pull request #1764 from yurivict/64-suffix
Allow to install the 'interface64' version concurrently with the regular version
2018-09-19 18:16:38 +02:00
Martin Kroeker
b991570210 Merge pull request #1762 from martin-frbg/issue1710-2
Add explicit casts to silence compiler warnings
2018-09-19 18:16:21 +02:00
Martin Kroeker
288aeea8a2 Fix default settings - USE_TLS and USE_SIMPLE_THREADED_LEVEL3 should both be off 2018-09-19 18:08:31 +02:00
Martin Kroeker
1ad1e79062 Catch inadvertent USE_TLS=0 declaration
for #1766
2018-09-19 18:03:43 +02:00
Martin Kroeker
b402626509 Do not use the new TLS code for non-threaded builds even if USE_TLS is set
Workaround for #1761 as that exposed a problem in the new code (which was intended to speed up multithreaded code only anyway).
2018-09-16 12:43:36 +02:00
Martin Kroeker
ec0cac1669 Merge pull request #4 from xianyi/develop
Update branch
2018-09-16 12:36:49 +02:00
Yuri
2349e15149 Allow to install the 'interface64' version concurrently with the regular version 2018-09-15 21:00:03 -07:00
Martin Kroeker
f3c262156e Add an explicit cast to silence a warning
for #1710
2018-09-13 14:24:29 +02:00
Martin Kroeker
30f5a69ab8 Add explicit cast to silence a warning
for #1710
2018-09-13 14:23:31 +02:00
Martin Kroeker
fd081a91e4 Merge pull request #1759 from martin-frbg/lapack283
Remove an unused variable from several LAPACKE 2stage_work functions
2018-09-11 13:52:09 +02:00
Martin Kroeker
094f8c3b57 remove unused variable ldb_t
Copied from Reference-LAPACK PR283
2018-09-11 10:53:47 +02:00
Martin Kroeker
5cf090f516 remove unused variable ldb_t
Copied from Reference-LAPACK PR283
2018-09-11 10:52:30 +02:00
Martin Kroeker
58363542e7 remove unused variable ldb_t
Copied from Reference-LAPACK PR283
2018-09-11 10:51:17 +02:00
Martin Kroeker
3abc22a5bf Merge pull request #1757 from brada4/develop
fix small typo in strmm_ LN
2018-09-09 22:55:15 +02:00
Andrew
1e531701b7 fix small typo 2018-09-09 16:52:25 +02:00
Martin Kroeker
5d42b6ea04 Merge pull request #1756 from martin-frbg/issue1754
Follow netlib renaming/aliasing CBLAS_ORDER to CBLAS_LAYOUT
2018-09-07 11:02:18 +02:00
Martin Kroeker
ba4f433321 Merge pull request #1749 from martin-frbg/issue1531
Fix ARMV8 cross-compilation for IOS
2018-09-07 11:02:01 +02:00
Martin Kroeker
4cf7315a5d Adjust ARMV8 SGEMM unrolling when using the C fallback kernel_2x2 for IOS 2018-09-06 21:41:54 +02:00
Martin Kroeker
b57af93792 just make CBLAS_LAYOUT an alias of the existing CBLAS_ORDER
to avoid having to change all instances of enum CBLAS_ORDER in this file
2018-09-06 16:54:31 +02:00
Martin Kroeker
8aeab0601e Follow netlib renaming/aliasing CBLAS_ORDER to CBLAS_LAYOUT
fixes #1754
2018-09-06 16:39:52 +02:00
Martin Kroeker
1cb7b9015e Conditional compilation of assembly files that IOS does not like 2018-09-04 11:06:51 +02:00
Martin Kroeker
a4bd41e9f2 Fix paths to C kernels for nrm2 2018-09-04 10:51:19 +02:00
Martin Kroeker
9e2bb0c641 Update with the changes from 0.3.3 2018-08-31 00:21:13 +02:00
Martin Kroeker
dbfd7524cd Update version to 0.3.4.dev 2018-08-31 00:19:21 +02:00
Martin Kroeker
2982ce505d Update version to 0.3.4.dev 2018-08-31 00:18:37 +02:00
Martin Kroeker
fd8d1868a1 Updates for 0.3.3 2018-08-31 00:07:48 +02:00
Martin Kroeker
f0563f14ba Version 0.3.3 2018-08-30 23:43:57 +02:00
Martin Kroeker
3197f86762 Version 0.3.3 2018-08-30 23:43:14 +02:00
Martin Kroeker
422a8fa953 Merge pull request #1747 from xianyi/develop
Merge develop into 0.3.x for 0.3.3
2018-08-30 23:42:19 +02:00
Martin Kroeker
5bac15adbd Merge pull request #1746 from martin-frbg/issue1674
Assume cross-compilation if host and target os differ
2018-08-30 17:48:07 +02:00
Martin Kroeker
e17f969fa0 Assume cross-compilation if host and target os differ
fixes #1674
2018-08-30 13:28:46 +02:00
Martin Kroeker
e11126b26a Merge pull request #1745 from martin-frbg/issue1743
Set USE_TRMM for all ZARCH variants to fix TRMM faults with zarch-gen…
2018-08-29 07:43:58 +02:00
Martin Kroeker
74608e470d Merge pull request #1744 from martin-frbg/lapack272
Fix missing replacements of ILAENV by ILAENV_2STAGE (lapack PR 272)
2018-08-28 22:58:58 +02:00
Martin Kroeker
f3fd44a731 Set USE_TRMM for all ZARCH variants to fix TRMM faults with zarch-generic
fixes #1743
2018-08-28 21:34:07 +02:00
Martin Kroeker
9e917b16db Fix missing replacements of ILAENV by ILAENV_2STAGE (lapack PR 272)
This could cause spurious "parameter has an illegal value" errors in DSYEVR and related routines, see https://github.com/Reference-LAPACK/lapack/issues/262
2018-08-28 21:11:54 +02:00
Martin Kroeker
8440a4cb1a Merge pull request #1742 from martin-frbg/interim033
Add combination of old and new thread memory code selectable by new option USE_TLS
2018-08-28 08:02:15 +02:00
Martin Kroeker
b55690a659 typo fix 2018-08-26 11:31:07 +02:00
Martin Kroeker
b902a40986 Rewrite glibc version check 2018-08-26 11:18:02 +02:00
Martin Kroeker
5991d1a6cd Update memory.c 2018-08-25 22:12:40 +02:00
Martin Kroeker
b1b743f434 Merge branch 'develop' into interim033 2018-08-25 19:45:19 +02:00
Martin Kroeker
2caa2210bb Add USE_TLS option to choose between old and new implementation of memory.c 2018-08-25 19:37:11 +02:00
Martin Kroeker
2a589c4b28 Add USE_TLS option to switch between old and new memory.c 2018-08-25 19:36:12 +02:00
Martin Kroeker
fd42ca462d Combo of default pre-0.3.1 memory.c and band-aided version of PR1739 2018-08-25 19:35:16 +02:00
Martin Kroeker
52d3f7af50 Merge pull request #1738 from sharkcz/s390x
detect z14 arch on s390x
2018-08-16 09:46:34 +02:00
Dan Horák
5c6e020f49 detect z14 arch on s390x 2018-08-14 12:30:38 +02:00
maamountki
e6c0e39492 Optimize Zgemv 2018-08-13 12:23:40 +03:00
Martin Kroeker
d4d3113adc Merge pull request #1731 from fenrus75/readme
add short blurb about avx512 and needed compiler to README
2018-08-13 00:01:37 +02:00
Martin Kroeker
375dff54fc Merge pull request #1733 from fenrus75/dsymv
Add an AVX512 enabled DSYMV (L) function
2018-08-12 18:18:36 +02:00
Martin Kroeker
a5f165275a Merge pull request #1732 from fenrus75/dgemv
Add an AVX512 enabled DGEMV (n)  function
2018-08-12 18:17:42 +02:00
Martin Kroeker
8c13aa495a Merge pull request #1730 from fenrus75/fix-sdot
Fix typo in sdot function
2018-08-12 18:17:01 +02:00
Martin Kroeker
1ee6d087c3 Merge pull request #1729 from fenrus75/dscal
Add an AVX512 enabled DSCAL function
2018-08-12 18:16:45 +02:00
Martin Kroeker
a95a784ab2 Merge pull request #1723 from maamountki/develop
Disable zgemv scale in gemv benchmark by default
2018-08-11 21:08:45 +02:00
Arjan van de Ven
9bec34cb67 Add an AVX512 enabled DSYMV (L) function
written in C intrinsics for best readability.
(the same C code works for Haswell as well)

For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough
2018-08-11 17:46:24 +00:00
Arjan van de Ven
87bebdbd8a Add an AVX512 enabled DGEMV (n) function
written in C intrinsics for best readability.
(the same C code works for Haswell as well)

For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough
2018-08-11 17:38:12 +00:00
Arjan van de Ven
9493f26309 add short blurb about avx512 and needed compiler to README 2018-08-11 17:21:46 +00:00
Arjan van de Ven
36add7570a Fix typo in sdot function
it looks like my previous pull request was short the final commit;
fix a typo in sdot
2018-08-11 17:16:45 +00:00
Arjan van de Ven
cacacc8007 Add an AVX512 enabled DSCAL function
written in C intrinsics for best readability.
(the same C code works for Haswell as well)

For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough
2018-08-11 17:14:57 +00:00
Martin Kroeker
1a00ef3d27 Merge pull request #1725 from fenrus75/axpy
Add AVX512 enabled SAXPY/DAXPY functions
2018-08-11 11:01:20 +02:00
Martin Kroeker
4c0d832ec3 Merge pull request #1724 from fenrus75/sdot
Add an AVX512 enabled SDOT function
2018-08-11 11:00:56 +02:00
Martin Kroeker
fc33cbc7bb Merge pull request #1728 from martin-frbg/changelog
Add changes from the 0.3.x releases
2018-08-10 13:24:36 +02:00
Martin Kroeker
c52a831ae4 Add changes from the 0.3.x releases
fixes #1727
2018-08-10 13:23:47 +02:00
Arjan van de Ven
2e99873ff7 Add AVX512 enabled SAXPY/DAXPY functions
written in C intrinsics for best readability.
(the same C code works for Haswell as well)

For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough
2018-08-10 02:58:32 +00:00
Arjan van de Ven
00abaa865b Add an AVX512 enabled SDOT function
written in C intrinsics for best readability.
(the same C code works for Haswell as well)

For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough
2018-08-10 02:33:43 +00:00
maamountki
33043f563f Disable scal to benchmark zgemv separately by default 2018-08-10 01:54:18 +03:00
Martin Kroeker
66da7677bd Merge pull request #1721 from fenrus75/ddot2
Add an AVX512 enabled DDOT function
2018-08-09 15:39:06 +02:00
Arjan van de Ven
7932ff3ea9 Add an AVX512 enabled DDOT function
written in C intrinsics for best readability.
(the same C code works for Haswell as well)

For logistical reasons the code falls back to the existing
haswell AVX2 implementation if the GCC or LLVM compiler is not new enough
2018-08-09 03:55:52 +00:00
Martin Kroeker
62f4c69708 Merge pull request #1717 from martin-frbg/issue1708
Add workaround for avx512 compilations on Cygwin
2018-08-06 22:05:47 +02:00
maamountki
453bfa7e71 [ZARCH] Restore detect() function 2018-08-06 20:03:49 +03:00
maamountki
23229011db [ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization 2018-08-06 18:20:40 +03:00
Martin Kroeker
73478664d4 Add workaround for avx512 compilations on Cygwin
fixes #1708
2018-08-06 16:40:32 +02:00
Martin Kroeker
ee955757f9 Merge pull request #1715 from stevengj/patch-1
fix blasabs for windows
2018-08-05 22:48:44 +02:00
Steven G. Johnson
48610a4524 fix blasabs for windows
Bugfix in #1713 for Windows (LLP64), where `blasabs` needs to be `llabs` rather than `labs` for the 64-bit API.
2018-08-05 08:18:51 -04:00
Martin Kroeker
4a553e8678 Merge pull request #1713 from martin-frbg/issue1710
Introduce blasabs macro and use it to switch between abs and labs for INTERFACE64
2018-08-04 23:51:31 +02:00
Martin Kroeker
e788102c10 Merge pull request #1709 from stevengj/patch-1
fabs -> fabsl
2018-08-04 23:51:10 +02:00
Martin Kroeker
165f00c159 fabs -> fabsl 2018-08-04 20:14:51 +02:00
Martin Kroeker
40c068a875 Introduce blasabs() to switch between abs() and labs() for INTERFACE64 2018-08-04 20:07:59 +02:00
Martin Kroeker
933896a1d0 Use blasabs to switch between abs and labs as needed for INTERFACE64 2018-08-04 20:06:49 +02:00
Steven G. Johnson
a4e321400b fabs -> fabsl
Fixes two calls that were using `fabs` on a `long double` argument rather than `fabsl`, which looks like it is doing an unintentional truncation to `double` precision.
2018-08-03 13:00:10 -04:00
Martin Kroeker
9e65430504 Merge pull request #1703 from wsttiger/cmake_fix
Set EXPORT_NAME to match OpenBLASConfig.cmake
2018-08-02 23:48:42 +02:00
Martin Kroeker
2cfa86b406 Merge pull request #1707 from extrowerk/haiku_support
Haiku supporting patches
2018-08-02 22:27:00 +02:00
Scott Thornton
2a9a9389ef Added target_include_directories() 2018-08-02 14:58:52 -05:00
Zoltán Mizsei
6463bffd59 Haiku supporting patches 2018-08-02 20:49:14 +02:00
Martin Kroeker
8ef7d4fb54 Merge pull request #1706 from oon3m0oo/develop
Fix #1705 where we incorrectly calculate page locations.
2018-08-02 18:53:34 +02:00
Craig Donner
6400868e55 Fix #1705 where we incorrectly calculate page locations.
Since we now use an allocation size that isn't a multiple of PAGESIZE, finding
the pages for run_bench wasn't terminating properly.  Now we detect if we've
found enough pages for the allocation and terminate the loop.
2018-08-02 16:21:19 +01:00
Scott Thornton
8ebf541e97 Set EXPORT_NAME to match OpenBLASConfig.cmake 2018-07-30 15:18:29 -05:00
Martin Kroeker
b03ae3f4dc Set version to 0.3.3.dev 2018-07-30 08:23:13 +02:00
Martin Kroeker
2cc8fb0ad2 Set version to 0.3.3.dev 2018-07-30 08:22:38 +02:00
Martin Kroeker
e8a68ef261 Merge pull request #1702 from xianyi/develop
Merge develop for 0.3.2
2018-07-30 07:25:01 +02:00
Martin Kroeker
64826a0d7d Merge branch 'release-0.3.0' into develop 2018-07-29 22:37:09 +02:00
Martin Kroeker
25f2d25cfe Merge pull request #1697 from martin-frbg/issue1696
Do not treat Windows UWB builds as cross-compiling
2018-07-25 19:55:29 +02:00
Martin Kroeker
73131fa30a Do not treat Windows UWB builds as cross-compiling 2018-07-24 17:46:33 +02:00
Martin Kroeker
66fcdd5be8 Merge pull request #1695 from martin-frbg/issue1692
Unset memory table entry, not just the local pointer to it on shutdown
2018-07-22 16:34:09 +02:00
Martin Kroeker
43ac839c16 Unset memory table entry, not just the temporary pointer to it on shutdown
to fix crash with multiple instances of OpenBLAS, #1692
2018-07-22 09:19:19 +02:00
Martin Kroeker
7ba5936ecd Merge pull request #1688 from martin-frbg/issue1673
Temporarily disable special handling of OPENMP thread memory allocation
2018-07-19 19:03:45 +02:00
Martin Kroeker
b14f44d2ad Temporarily disable special handling of OPENMP thread memory allocation
for issue #1673
2018-07-19 08:57:56 +02:00
Martin Kroeker
e71d70ba87 Merge pull request #1681 from martin-frbg/issue1671
Add cpu identification via mfpvr call for the BSDs
2018-07-16 22:47:05 +02:00
Martin Kroeker
d671870f5f Merge pull request #1684 from martin-frbg/issue1672
Work around utest failures in the MIPS64 SICORTEX target
2018-07-16 22:46:49 +02:00
Martin Kroeker
4e103c822c typo fix 2018-07-16 12:56:39 +02:00
Martin Kroeker
d2142760e0 Fix precision problem in DSDOT 2018-07-15 17:11:40 +02:00
Martin Kroeker
2fbfc64da8 Use C kernels for default c/zAXPY, xROT, c/zSWAP 2018-07-15 17:09:55 +02:00
Martin Kroeker
8d5b33b6be Add cpu identification via mfpvr call for the BSDs
fixes #1671
2018-07-12 23:39:00 +02:00
Martin Kroeker
36aea5ce2d Merge pull request #1680 from martin-frbg/snprint
Fix wrong redefinitions of snprintf for older MSVC
2018-07-12 14:05:13 +02:00
Martin Kroeker
1309711e24 Fix declaration of snprintf for older MSVC
_snprintf_s takes an additional (size) argument, so is no direct replacement.
(Note that this code is currently unused - the two instances of snprintf here are within ifdef blocks that are not compiled for MSVC)
2018-07-12 11:47:52 +02:00
Martin Kroeker
571e9de2ac Fix definition of snprintf for MSVC
MS _snprintf_s takes an additional argument for the size of the buffer, so is not a direct replacement (utest/ctest.h from which I copied was wrong)
2018-07-12 11:42:25 +02:00
Martin Kroeker
448ed15115 Merge pull request #1678 from martin-frbg/issue1677
Define snprintf for older versions of MSVC
2018-07-12 09:21:34 +02:00
Martin Kroeker
045fb5ea2c Define snprintf for older versions of MSVC
for #1677
2018-07-12 07:30:58 +02:00
Martin Kroeker
4dd70d98d7 Merge pull request #1667 from xianyi/revert-1642-develop
Revert "Rewrite &= -> = and simplify the initial blocking phase."
2018-07-04 08:27:21 +02:00
Martin Kroeker
504310eeb9 Merge pull request #1665 from martin-frbg/cpuid-ryzen2
Add cpuid for AMD Ryzen 2
2018-07-04 08:19:40 +02:00
Martin Kroeker
ea1f39518f Merge pull request #1663 from martin-frbg/issue1641
Double MAX_ALLOCATING_THREADS to fix segfaults with Go and Octave
2018-07-04 08:19:11 +02:00
Martin Kroeker
5f2a3c05cd Revert "Rewrite &= -> = and simplify the initial blocking phase." 2018-07-03 21:42:28 +02:00
Martin Kroeker
d0ec4325cf Add cpuid for AMD Ryzen 2 2018-07-03 21:03:24 +02:00
Martin Kroeker
3f73e8b8cf Add cpuid for AMD Ryzen 2
for #1664
2018-07-03 21:01:35 +02:00
Martin Kroeker
a83f01e0ee Merge pull request #1662 from martin-frbg/cmake-avx512
Add -march=skylake-avx512 to AVX512 compile check and suppress its ou…
2018-07-03 17:40:09 +02:00
Martin Kroeker
a49203b48c Double MAX_ALLOCATING_THREADS to fix segfaults with Go and Octave
for #1641
2018-07-03 17:35:54 +02:00
Martin Kroeker
b74aef2816 Add -march=skylake-avx512 to AVX512 compile check and suppress its output 2018-07-03 14:41:44 +02:00
Martin Kroeker
a9fa805007 Merge pull request #1660 from martin-frbg/issue1659
Fix typo that broke compilation with DYNAMIC_ARCH and NO_AVX2
2018-07-02 17:48:19 +02:00
Martin Kroeker
9d15a3bd16 Fix typo that broke compilation with DYNAMIC_ARCH and NO_AVX2
fixes #1659
2018-07-02 14:40:41 +02:00
Martin Kroeker
c6aec89d10 Merge pull request #1657 from martin-frbg/release-0.3.0
Release 0.3.1
2018-07-01 12:03:07 +02:00
Martin Kroeker
bbf2124970 set version number to 0.3.2.dev 2018-07-01 12:01:51 +02:00
Martin Kroeker
1392eba488 set version number to 0.3.2.dev 2018-07-01 12:01:16 +02:00
Martin Kroeker
e6d7711199 remove dev suffix from version number 2018-07-01 11:59:47 +02:00
Martin Kroeker
7a914347c5 remove dev suffix from version number 2018-07-01 11:58:57 +02:00
Martin Kroeker
61659f8765 Merge pull request #1648 from martin-frbg/nofort
Handle NOFORTRAN=0
2018-07-01 11:56:40 +02:00
Martin Kroeker
3a8f0a6a1f Merge pull request #1656 from xianyi/develop
Update the 0.3 branch from develop
2018-07-01 11:55:21 +02:00
Martin Kroeker
3d3c19717c Merge pull request #1655 from martin-frbg/issue1641
Fix apparent off-by-one error in calculation of MAX_ALLOCATING_THREADS
2018-07-01 08:41:22 +02:00
Martin Kroeker
24e344038d Merge pull request #1654 from martin-frbg/avx512check
Add compiler option to avx512 test and hide test output
2018-07-01 01:17:03 +02:00
Martin Kroeker
4e9c34018e Fix apparent off-by-one error in calculation of MAX_ALLOCATING_THREADS
fixes #1641
2018-06-30 23:57:50 +02:00
Martin Kroeker
f5243e8e1f Add compiler option to avx512 test and hide test output 2018-06-30 23:47:44 +02:00
Martin Kroeker
ba8388cee0 Merge pull request #1651 from martin-frbg/avx512-nodgemm
Disable the 16x2 DTRMM kernel on SkylakeX as well
2018-06-30 17:48:03 +02:00
Martin Kroeker
6e54b0a027 Disable the 16x2 DTRMM kernel on SkylakeX as well 2018-06-30 17:31:06 +02:00
Martin Kroeker
40c8cbc3bf Merge pull request #1650 from martin-frbg/avx512-nodgemm
Disable the AVX512 DGEMM kernel for now
2018-06-30 13:05:46 +02:00
Martin Kroeker
d3c9eb4c7d Merge pull request #1639 from martin-frbg/dyn_list
Add DYNAMIC_LIST option for user-defined list of dynamic targets
2018-06-30 13:05:30 +02:00
Martin Kroeker
f0a8dc2eec Disable the AVX512 DGEMM kernel for now
due to #1643
2018-06-30 11:34:48 +02:00
Martin Kroeker
cc92257ea6 Update Makefile 2018-06-27 00:09:21 +02:00
Martin Kroeker
2aba1b1658 Merge branch 'develop' into nofort 2018-06-27 00:07:32 +02:00
Martin Kroeker
8396e9e777 Handle NOFORTRAN=0 2018-06-27 00:00:27 +02:00
Martin Kroeker
bfad307ed7 Merge pull request #1647 from martin-frbg/armv7-dot
Remove premature exits from ARMV7 xdot codes
2018-06-26 22:27:30 +02:00
Martin Kroeker
b83e4c60c7 Remove premature exit for INC_X or INC_Y zero 2018-06-26 20:46:42 +02:00
Martin Kroeker
e344db269b Remove premature exit for INC_X or INC_Y zero 2018-06-26 20:45:57 +02:00
Martin Kroeker
545b82efd3 Remove premature exit for INC_X or INC_Y zero 2018-06-26 20:45:00 +02:00
Martin Kroeker
e322a951fe Remove premature exit for INC_X or INC_Y zero 2018-06-26 20:44:13 +02:00
Martin Kroeker
ff2f171036 Merge pull request #1644 from martin-frbg/revert-filterout
Revert changes to NOFORTRAN handling in Makefile
2018-06-26 10:15:15 +02:00
Martin Kroeker
092175cfec Revert changes to NOFORTRAN handling from 952541e 2018-06-26 08:09:52 +02:00
Martin Kroeker
750162a05f Try gradual fallback for cores not in the dynamic core list 2018-06-25 21:02:31 +02:00
Martin Kroeker
e6d93f20f1 Merge pull request #2 from martin-frbg/develop
merge develop
2018-06-25 20:48:10 +02:00
Martin Kroeker
c38c65eb65 Merge pull request #1 from xianyi/develop
Merge xianyi:develop into develop
2018-06-25 20:45:56 +02:00
Martin Kroeker
ce3651516f Merge pull request #1642 from oon3m0oo/develop
Rewrite &= -> = and simplify the initial blocking phase.
2018-06-25 19:23:40 +02:00
Craig Donner
0144068537 Rewrite &= -> = and simplify the initial blocking phase. 2018-06-25 15:08:55 +01:00
Martin Kroeker
1833a67071 Add support for a user-defined list of dynamic targets 2018-06-23 19:42:15 +02:00
Martin Kroeker
0b2b83d9ed Add support for a user-defined list of dynamic targets 2018-06-23 19:41:32 +02:00
Martin Kroeker
62cf769aa6 Merge pull request #1638 from martin-frbg/issue1637
Expose the CBLAS interface to the IxAMIN functions and have make build it
2018-06-23 15:01:02 +02:00
Martin Kroeker
eb71d61c7c Expose CBLAS interface to BLAS extensions iXamin 2018-06-23 13:31:09 +02:00
Martin Kroeker
9cf22b7d91 Build cblas_iXamin interfaces 2018-06-23 13:27:30 +02:00
Martin Kroeker
cc66743b66 Merge pull request #1634 from oon3m0oo/develop
Fix data races reported by TSAN.
2018-06-21 21:01:03 +02:00
oon3m0oo
2aa0a5804e Use BLAS rather than CBLAS in test_fork.c (#1626)
This is handy for people not using lapack.
2018-06-21 18:47:45 +02:00
Craig Donner
28c28ed275 Fix data races reported by TSAN. 2018-06-21 16:41:02 +01:00
oon3m0oo
a399d00425 Further improvements to memory.c. (#1625)
- Compiler TLS is now used only when the compiler supports it
- If compiler TLS is unsupported, we use platform-specific TLS
- Only one variable (an index) is now in TLS
- We only access TLS once per alloc, and never when freeing
- Allocation / release info is now stored within the allocation itself, by
  over-allocating; this saves having external structures do the bookkeeping, and
  reduces some of the redundant data that was being stored (such as addresses)
- We never hit the alloc lock when not using SMP or when using OpenMP (that was
  my fault)
- Now that there are fewer tracking structures I think this is a bit easier to
  read than before
2018-06-20 22:04:03 +02:00
Martin Kroeker
f66b9c8826 Merge pull request #1630 from martin-frbg/x86-march
Add -march=skylake-avx512 to flags if target is skylake x
2018-06-20 21:51:57 +02:00
Martin Kroeker
2946c46024 Merge pull request #1631 from oon3m0oo/stack
Avoid declaring arrays of size 0 when making large stack allocations.
2018-06-20 21:51:38 +02:00
Craig Donner
05978528c3 Avoid declaring arrays of size 0 when making large stack allocations. 2018-06-20 17:03:18 +01:00
Martin Kroeker
ef6f0b645e Merge pull request #1629 from martin-frbg/issue1628
Make gfortran link libomp for clang in the tests; avoid two typical gotchas with NOFORTRAN
2018-06-20 16:41:13 +02:00
Martin Kroeker
0c5b7b400b Add -march=skylake-avx512 to flags if target is skylake x 2018-06-20 15:16:19 +02:00
Martin Kroeker
952541e840 Need to use filter-out to handle NOFORTRAN not set 2018-06-20 13:20:30 +02:00
Martin Kroeker
9369d3e6e5 Modify NOFORTRAN tests to always check the value; fix rewriting of NO_FORTRAN 2018-06-19 23:28:06 +02:00
Martin Kroeker
10b70c904d Handle erroneous user settings NOFORTRAN=0 and NO_FORTRAN 2018-06-19 20:53:19 +02:00
Martin Kroeker
6a5ab083b7 Handle special case of gfortran+clang+OpenMP 2018-06-19 20:47:33 +02:00
Martin Kroeker
1f9e4f3193 Handle special case of gfortran+clang+OpenMP 2018-06-19 20:46:36 +02:00
Martin Kroeker
5a6a2bed9a Merge pull request #1623 from fenrus75/fast-thread
Initialize only the required subset of the jobs array, fix barriers and improve switch ratio on SkylakeX and Haswell. For issue #1622
2018-06-18 09:02:40 +02:00
Martin Kroeker
2d8cc7193a Support upcoming Intel Cannon Lake CPUs as Skylake X (#1621)
* Support  upcoming Cannon Lake as Skylake X
2018-06-17 23:38:14 +02:00
Arjan van de Ven
2ddc96c9e5 make WMB / MB safer on x86-64
make it so that

if (foo)
	RMB;
else
	MB;

is always done correctly and without syntax surprises
2018-06-17 18:06:24 +00:00
Arjan van de Ven
7e39ffe113 On x86-64, make MB/WMB compiler barriers
While on x86(64) one does not normally need full memory barriers, it's
good practice to at least use compiler barriers for places where on other
architectures memory barriers are used; this prevents the compiler
from over-optimizing.
2018-06-17 17:53:15 +00:00
Arjan van de Ven
73de17664d Add missing barriers in gemm scheduler
a few places in the gemm scheduler code were missing barriers;
the code likely worked OK due to heavy use of volatile / _Atomic
but there's no reason to get this incorrect
2018-06-17 17:50:43 +00:00
Arjan van de Ven
6eb4b9ae7c Tune HASWELL SWITCH_RATIO as well
Similar to the SKYLAKEX patch, 32 seems to work best
(much better than 4 or 16)

Before (4)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               15554.3    7.2       0.2%                             30353.8      3.7       0.3%
  64 x 64               30346.8    8.7       1.6%                             63495.0      4.1      -0.1%
  65 x 65               81668.1    3.4    -123.3%                             82705.2      3.3     -21.2%
  80 x 80              105045.9    4.9     -95.5%                            115226.0      4.5      -2.2%
  96 x 96              152461.2    5.8     -74.3%                            148156.3      6.0      16.4%
 112 x 112             188505.2    7.5     -42.2%                            171187.3      8.2      36.4%
 128 x 128             257884.0    8.1     -39.5%                            224764.8      9.3      46.0%

Intermediate (16)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               15565.7    7.2       0.2%                             30378.9      3.7       0.2%
  64 x 64               30430.2    8.7       1.3%                             63046.4      4.2       0.6%
  65 x 65               27306.0   10.1      25.3%                             38879.2      7.1      43.0%
  80 x 80               51008.7   10.1       5.1%                             61007.6      8.4      45.9%
  96 x 96               70856.7   12.5      19.0%                             83403.1     10.6      53.0%
 112 x 112              84769.9   16.6      36.0%                             99920.1     14.1      62.9%
 128 x 128              84213.2   25.0      54.5%                            113024.2     18.6      72.8%

After (32)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               15537.3    7.2       0.3%                             30537.0      3.6      -0.3%
  64 x 64               30352.7    8.7       1.6%                             62597.8      4.2       1.3%
  65 x 65               36857.0    7.5      -0.8%                             56167.6      4.9      17.7%
  80 x 80               42552.6   12.1      20.8%                             69536.7      7.4      38.3%
  96 x 96               52101.5   17.1      40.5%                             91016.1      9.7      48.7%
 112 x 112              63853.7   22.1      51.8%                            110507.4     12.7      58.9%
 128 x 128              73966.1   28.4      60.0%                            163146.4     12.9      60.8%
2018-06-17 17:08:36 +00:00
Arjan van de Ven
5c6f008365 Tune param.h for SkylakeX
param.h defines a per-platform SWITCH_RATIO, which is used as a measure for how fine
grained the blocks for gemm need to be split up. Many platforms define this to 4.

The reality is that the gemm low level implementation for SkylakeX likes bigger blocks
due to the nature of SIMD... by tuning the SWITCH_RATIO to 32 the threading performance
improves significantly:

Before
   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10756.0   10.5      -0.5%                             18296.7      6.1      -1.7%
  64 x 64               20490.0   12.9       1.4%                             40615.0      6.5       0.0%
  65 x 65               83528.3    3.3    -210.9%                             96319.0      2.9     -83.3%
  80 x 80              101453.5    5.1    -166.3%                            128021.7      4.0     -76.6%
  96 x 96              149795.1    5.9    -143.1%                            168059.4      5.3     -47.4%
 112 x 112             191481.2    7.3    -105.8%                            204165.0      6.9     -14.6%
 128 x 128             265019.2    7.9     -99.0%                            272006.4      7.7      -5.3%

After
   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10666.3   10.6       0.4%                             18236.9      6.2      -1.4%
  64 x 64               20410.1   13.0       1.8%                             39925.8      6.6       1.7%
  65 x 65               34983.0    7.9     -30.2%                             51494.6      5.4       2.0%
  80 x 80               39769.1   13.0      -4.4%                             63805.2      8.1      12.0%
  96 x 96               45169.6   19.7      26.7%                             80065.8     11.1      29.8%
 112 x 112              57026.1   24.7      38.7%                             99535.5     14.2      44.1%
 128 x 128              64789.8   32.5      51.3%                            117407.2     17.9      54.6%

With this change, threading starts to be a win already at 96x96
2018-06-17 15:47:50 +00:00
Arjan van de Ven
d148ec4ea1 Don't use _Atomic for jobs sometimes...
The use of _Atomic leads to really bad code generation in the compiler
(on x86, you get 2 "mfence" memory barriers around each access with gcc8, despite
x86 being ordered and cache coherent). But there's a fallback in the code that
just uses volatile which is more than plenty in practice.

If we're nervous about cross thread synchronization for these variables, we should
make the YIELD function be a compiler/memory barrier instead.

performance before (after last commit)

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10630.0   10.6       0.7%                             18112.8      6.2      -0.7%
  64 x 64               20374.8   13.0       1.9%                             40487.0      6.5       0.4%
  65 x 65              141955.2    1.9    -428.3%                            146708.8      1.9    -179.2%
  80 x 80              178921.1    2.9    -369.6%                            186032.7      2.8    -156.6%
  96 x 96              205436.2    4.3    -233.4%                            224513.1      3.9     -97.0%
 112 x 112             244408.2    5.8    -162.7%                            262158.7      5.4     -47.1%
 128 x 128             321334.5    6.5    -141.3%                            333829.0      6.3     -29.2%

Performance with this patch (roughly a 2x improvement):

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10756.0   10.5      -0.5%                             18296.7      6.1      -1.7%
  64 x 64               20490.0   12.9       1.4%                             40615.0      6.5       0.0%
  65 x 65               83528.3    3.3    -210.9%                             96319.0      2.9     -83.3%
  80 x 80              101453.5    5.1    -166.3%                            128021.7      4.0     -76.6%
  96 x 96              149795.1    5.9    -143.1%                            168059.4      5.3     -47.4%
 112 x 112             191481.2    7.3    -105.8%                            204165.0      6.9     -14.6%
 128 x 128             265019.2    7.9     -99.0%                            272006.4      7.7      -5.3%
2018-06-17 15:39:15 +00:00
Arjan van de Ven
9e162146a9 Only initialize the part of the jobs array that will get used
The jobs array is getting initialized in O(compiled cpus^2) complexity.
Distros and people with bigger systems will use pretty high values
(128 or 256 or more) for this value, leading to interesting bubbles
in performance.

Baseline (single threaded performance) gets roughly 13 - 15 multiplications per cycle
in the interesting range (threading kicks in at 65x65 mult by 65x65).
The hardware is capable of 32 multiplications per cycle theoretically.

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10703.9   10.6       0.0%                             17990.6      6.3       0.0%
  64 x 64               20778.4   12.8       0.0%                             40629.2      6.5       0.0%
  65 x 65               26869.9   10.3       0.0%                             52545.7      5.3       0.0%
  80 x 80               38104.5   13.5       0.0%                             72492.7      7.1       0.0%
  96 x 96               61626.4   14.4       0.0%                            113983.8      7.8       0.0%
 112 x 112              91803.8   15.3       0.0%                            180987.3      7.8       0.0%
 128 x 128             133161.4   15.8       0.0%                            258374.3      8.1       0.0%

When threading is turned on
TARGET=SKYLAKEX F_COMPILER=GFORTRAN  SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0  NUM_THREADS=128

  Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10725.9   10.5      -0.2%                             18134.9      6.2      -0.8%
  64 x 64               20500.6   12.9       1.3%                             40929.1      6.5      -0.7%
  65 x 65             2040832.1    0.1   -7495.2%                           2097633.6      0.1   -3892.0%
  80 x 80             2063129.1    0.2   -5314.4%                           2119925.2      0.2   -2824.3%
  96 x 96             2070374.5    0.4   -3259.6%                           2173604.4      0.4   -1806.9%
 112 x 112            2111721.5    0.7   -2169.6%                           2263330.8      0.6   -1170.0%
 128 x 128            2276181.5    0.9   -1609.3%                           2377228.9      0.9    -820.1%

There is a deep deep cliff once you hit 65x65

With this patch

   Matrix          SGEMM cycles    MPC                                   DGEMM cycles      MPC
  48 x 48               10630.0   10.6       0.7%                             18112.8      6.2      -0.7%
  64 x 64               20374.8   13.0       1.9%                             40487.0      6.5       0.4%
  65 x 65              141955.2    1.9    -428.3%                            146708.8      1.9    -179.2%
  80 x 80              178921.1    2.9    -369.6%                            186032.7      2.8    -156.6%
  96 x 96              205436.2    4.3    -233.4%                            224513.1      3.9     -97.0%
 112 x 112             244408.2    5.8    -162.7%                            262158.7      5.4     -47.1%
 128 x 128             321334.5    6.5    -141.3%                            333829.0      6.3     -29.2%

The cliff is very significantly reduced.
(more to follow)
2018-06-17 15:32:03 +00:00
Martin Kroeker
47bf0dba8f Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620)
* Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD
* Amended description of GEMM_MULTITHREAD_THRESHOLD
to reflect #742 making it track floating point operations rather than matrix size
2018-06-15 11:25:05 +02:00
Martin Kroeker
12603b7dbb Merge pull request #1618 from oon3m0oo/less_locking
Remove the need for most locking in memory.c.
2018-06-15 00:10:29 +02:00
Craig Donner
bf40f806ef Remove the need for most locking in memory.c.
Using thread local storage for tracking memory allocations means that threads
no longer have to lock at all when doing memory allocations / frees. This
particularly helps the gemm driver since it does an allocation per invocation.
Even without threading at all, this helps, since even calling a lock with
no contention has a cost:

Before this change, no threading:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4          102 ns        102 ns   13504412
BM_SGEMM/6          175 ns        175 ns    7997580
BM_SGEMM/8          205 ns        205 ns    6842073
BM_SGEMM/10         266 ns        266 ns    5294919
BM_SGEMM/16         478 ns        478 ns    2963441
BM_SGEMM/20         690 ns        690 ns    2144755
BM_SGEMM/32        1906 ns       1906 ns     716981
BM_SGEMM/40        2983 ns       2983 ns     473218
BM_SGEMM/64        9421 ns       9422 ns     148450
BM_SGEMM/72       12630 ns      12631 ns     112105
BM_SGEMM/80       15845 ns      15846 ns      89118
BM_SGEMM/90       25675 ns      25676 ns      54332
BM_SGEMM/100      29864 ns      29865 ns      47120
BM_SGEMM/112      37841 ns      37842 ns      36717
BM_SGEMM/128      56531 ns      56532 ns      25361
BM_SGEMM/140      75886 ns      75888 ns      18143
BM_SGEMM/150      98493 ns      98496 ns      14299
BM_SGEMM/160     102620 ns     102622 ns      13381
BM_SGEMM/170     135169 ns     135173 ns      10231
BM_SGEMM/180     146170 ns     146172 ns       9535
BM_SGEMM/189     190226 ns     190231 ns       7397
BM_SGEMM/200     194513 ns     194519 ns       7210
BM_SGEMM/256     396561 ns     396573 ns       3531
```
with this change:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4           95 ns         95 ns   14500387
BM_SGEMM/6          166 ns        166 ns    8381763
BM_SGEMM/8          196 ns        196 ns    7277044
BM_SGEMM/10         256 ns        256 ns    5515721
BM_SGEMM/16         463 ns        463 ns    3025197
BM_SGEMM/20         636 ns        636 ns    2070213
BM_SGEMM/32        1885 ns       1885 ns     739444
BM_SGEMM/40        2969 ns       2969 ns     472152
BM_SGEMM/64        9371 ns       9372 ns     148932
BM_SGEMM/72       12431 ns      12431 ns     112919
BM_SGEMM/80       15615 ns      15616 ns      89978
BM_SGEMM/90       25397 ns      25398 ns      55041
BM_SGEMM/100      29445 ns      29446 ns      47540
BM_SGEMM/112      37530 ns      37531 ns      37286
BM_SGEMM/128      55373 ns      55375 ns      25277
BM_SGEMM/140      76241 ns      76241 ns      18259
BM_SGEMM/150     102196 ns     102200 ns      13736
BM_SGEMM/160     101521 ns     101525 ns      13556
BM_SGEMM/170     136182 ns     136184 ns      10567
BM_SGEMM/180     146861 ns     146864 ns       9035
BM_SGEMM/189     192632 ns     192632 ns       7231
BM_SGEMM/200     198547 ns     198555 ns       6995
BM_SGEMM/256     392316 ns     392330 ns       3539
```

Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost
of small matrix operations was overshadowed by thread locking (look smaller than
32) even when not explicitly spawning threads:
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4          328 ns        328 ns    4170562
BM_SGEMM/6          396 ns        396 ns    3536400
BM_SGEMM/8          418 ns        418 ns    3330102
BM_SGEMM/10         491 ns        491 ns    2863047
BM_SGEMM/16         710 ns        710 ns    2028314
BM_SGEMM/20         871 ns        871 ns    1581546
BM_SGEMM/32        2132 ns       2132 ns     657089
BM_SGEMM/40        3197 ns       3196 ns     437969
BM_SGEMM/64        9645 ns       9645 ns     144987
BM_SGEMM/72       35064 ns      32881 ns      50264
BM_SGEMM/80       37661 ns      35787 ns      42080
BM_SGEMM/90       36507 ns      36077 ns      40091
BM_SGEMM/100      32513 ns      31850 ns      48607
BM_SGEMM/112      41742 ns      41207 ns      37273
BM_SGEMM/128      67211 ns      65095 ns      21933
BM_SGEMM/140      68263 ns      67943 ns      19245
BM_SGEMM/150     121854 ns     115439 ns      10660
BM_SGEMM/160     116826 ns     115539 ns      10000
BM_SGEMM/170     126566 ns     122798 ns      11960
BM_SGEMM/180     130088 ns     127292 ns      11503
BM_SGEMM/189     120309 ns     116634 ns      13162
BM_SGEMM/200     114559 ns     110993 ns      10000
BM_SGEMM/256     217063 ns     207806 ns       6417
```
and after, it's gone (note this includes my other change which reduces calls
to num_cpu_avail):
```
----------------------------------------------------
Benchmark             Time           CPU Iterations
----------------------------------------------------
BM_SGEMM/4           95 ns         95 ns   12347650
BM_SGEMM/6          166 ns        166 ns    8259683
BM_SGEMM/8          193 ns        193 ns    7162210
BM_SGEMM/10         258 ns        258 ns    5415657
BM_SGEMM/16         471 ns        471 ns    2981009
BM_SGEMM/20         666 ns        666 ns    2148002
BM_SGEMM/32        1903 ns       1903 ns     738245
BM_SGEMM/40        2969 ns       2969 ns     473239
BM_SGEMM/64        9440 ns       9440 ns     148442
BM_SGEMM/72       37239 ns      33330 ns      46813
BM_SGEMM/80       57350 ns      55949 ns      32251
BM_SGEMM/90       36275 ns      36249 ns      42259
BM_SGEMM/100      31111 ns      31008 ns      45270
BM_SGEMM/112      43782 ns      40912 ns      34749
BM_SGEMM/128      67375 ns      64406 ns      22443
BM_SGEMM/140      76389 ns      67003 ns      21430
BM_SGEMM/150      72952 ns      71830 ns      19793
BM_SGEMM/160      97039 ns      96858 ns      11498
BM_SGEMM/170     123272 ns     122007 ns      11855
BM_SGEMM/180     126828 ns     126505 ns      11567
BM_SGEMM/189     115179 ns     114665 ns      11044
BM_SGEMM/200      89289 ns      87259 ns      16147
BM_SGEMM/256     226252 ns     222677 ns       7375
```

I've also tested this with ThreadSanitizer and found no data races during
execution.  I'm not sure why 200 is always faster than its neighbors, we must
be hitting some optimal cache size or something.
2018-06-14 16:54:58 +01:00
Martin Kroeker
ed682a4a0c Merge pull request #1619 from martin-frbg/issue1580
Update OSX deployment target to 10.8
2018-06-14 17:48:51 +02:00
Martin Kroeker
fcb77ab129 Update OSX deployment target to 10.8
fixes #1580
2018-06-14 16:57:58 +02:00
Martin Kroeker
26e1cfb653 Merge pull request #1607 from martin-frbg/dynarch
Move some x86_64 DYNAMIC_ARCH targets to new DYNAMIC_OLDER option
2018-06-14 16:52:55 +02:00
Martin Kroeker
c628c6fa59 Merge pull request #1612 from oon3m0oo/cpus
Fixed a few more unnecessary calls to num_cpu_avail.
2018-06-14 16:51:31 +02:00
Martin Kroeker
67d81ab49d Merge pull request #1609 from martin-frbg/issue1529
Create OpenBLASConfig.cmake in cmake builds as well
2018-06-12 23:00:24 +02:00
Martin Kroeker
2f957947a6 Merge pull request #1613 from xianyi/revert-1600-noyield
Revert "Use usleep instead of sched_yield by default"
2018-06-11 17:14:49 +02:00
Martin Kroeker
de8fff671d Revert "Use usleep instead of sched_yield by default" 2018-06-11 17:05:27 +02:00
Martin Kroeker
6f71c0fce4 Return a somewhat sane default value for L2 cache size if cpuid retur… (#1611)
* Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected

Fixes #1610, the KVM hypervisor on Google Chromebooks returning zero for CPUID  0x80000006, causing DYNAMIC_ARCH
builds of OpenBLAS to hang
2018-06-11 13:26:19 +02:00
Craig Donner
c2545b0fd6 Fixed a few more unnecessary calls to num_cpu_avail.
I don't have as many benchmarks for these as for gemm, but it should still
make a difference for small matrices.
2018-06-11 10:17:16 +01:00
Martin Kroeker
e65f451409 include CMakePackageConfigHelpers 2018-06-10 15:09:43 +02:00
Martin Kroeker
02634b549b Add template for OpenBLASConfig.cmake 2018-06-10 09:25:46 +02:00
Martin Kroeker
0bea6bb9e7 Create OpenBLASConfig.cmake from cmake as well 2018-06-10 09:24:37 +02:00
Martin Kroeker
3313e4b946 Merge pull request #1608 from martin-frbg/issue874
Enable parallel make on MS Windows by default
2018-06-09 19:57:33 +02:00
Martin Kroeker
e9cd11768c Enable parallel make on MS Windows by default
fixes #874
2018-06-09 17:54:36 +02:00
Martin Kroeker
63f7395fb4 Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option 2018-06-09 16:31:38 +02:00
Martin Kroeker
1cbd8f3ae4 Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option 2018-06-09 16:30:46 +02:00
Martin Kroeker
6c2d90ba77 Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option 2018-06-09 16:29:17 +02:00
Martin Kroeker
0297b3211a Merge pull request #1605 from oon3m0oo/develop
Improve performance of GEMM for small matrices when SMP is defined.
2018-06-09 12:42:34 +02:00
Craig Donner
66316b9f4c Improve performance of GEMM for small matrices when SMP is defined.
Always checking num_cpu_avail() regardless of whether threading will actually
be used adds noticeable overhead for small matrices.  Most other uses of
num_cpu_avail() do so only if threading will be used, so do the same here.
2018-06-07 15:29:13 +01:00
Martin Kroeker
6adc4b7b36 Merge pull request #1601 from martin-frbg/zaxpy
Use a single thread for small input size in zaxpy
2018-06-07 14:09:58 +02:00
Martin Kroeker
2ade0ef085 Merge pull request #1600 from martin-frbg/noyield
Use usleep instead of sched_yield by default
2018-06-07 12:42:00 +02:00
Martin Kroeker
e8880c1699 Use a single thread for small input size
copies daxpy improvement from #27, see #1560
2018-06-07 10:26:55 +02:00
Martin Kroeker
ed7c4a043b Use usleep instead of sched_yield by default
sched_yield only burns cpu cycles, fixes #900,  see also #923, #1560
2018-06-07 10:18:26 +02:00
Martin Kroeker
cf234a0561 Merge pull request #1589 from fenrus75/skylakex
Initial support for SkylakeX / AVX512
2018-06-06 22:07:09 +02:00
Martin Kroeker
ae2a33128b Merge pull request #1599 from martin-frbg/c_check_avx512
Improved AVX512 test case for c_check
2018-06-06 18:42:42 +02:00
Martin Kroeker
e4718b1fee Better AVX512 test case 2018-06-06 16:51:30 +02:00
Martin Kroeker
9b87b64262 Improve AVX512 testcase
clang 3.4 managed to accept the original test code, only to fail on the actual Skylake asm later
2018-06-06 16:49:00 +02:00
Martin Kroeker
0218b884c1 Merge pull request #1598 from martin-frbg/issue1593-2
Restore _Atomic define before stdatomic.h for old gcc
2018-06-06 12:48:26 +02:00
Martin Kroeker
83da278093 Update common.h 2018-06-06 09:27:49 +02:00
Martin Kroeker
358d4df2bd Merge branch 'develop' into issue1593-2 2018-06-06 09:21:41 +02:00
Martin Kroeker
06d43760e4 Restore _Atomic define before stdatomic.h for old gcc
see #1593
2018-06-06 09:18:10 +02:00
Martin Kroeker
a4af8861ff Merge pull request #1597 from martin-frbg/cmake-avx512
Check build system support for AVX512 instructions
2018-06-06 07:22:20 +02:00
Martin Kroeker
7fb62aed7e Check build system support for AVX512 instructions 2018-06-05 23:29:33 +02:00
Martin Kroeker
f6021c798d Re-enable QUIET_MAKE 2018-06-05 19:09:38 +02:00
Martin Kroeker
e8002536ec disable quiet_make for the moment 2018-06-05 18:23:01 +02:00
Martin Kroeker
ce6317f6c0 Merge pull request #1594 from martin-frbg/issue1593
Fix inverted condition in _Atomic declaration
2018-06-05 16:02:51 +02:00
Martin Kroeker
15a78d6b66 export NO_AVX512 setting 2018-06-05 15:58:34 +02:00
Martin Kroeker
354a976a59 Fix inverted condition in _Atomic declaration
fixes #1593
2018-06-05 10:31:34 +02:00
Martin Kroeker
38ad05bd04 Extend loop range to find SkylakeX in force_coretype 2018-06-05 10:26:49 +02:00
Martin Kroeker
b7feded85a Propagate NO_AVX512 via CCOMMON_OPT 2018-06-05 10:24:05 +02:00
Martin Kroeker
dc9fe05ab5 Update cpuid_x86.c 2018-06-04 17:10:19 +02:00
Martin Kroeker
8be027e4c6 Update dynamic.c 2018-06-04 14:36:39 +02:00
Martin Kroeker
ac7b6e3e9a Fix misplaced endif 2018-06-04 08:23:40 +02:00
Martin Kroeker
fc66a0ec0b Merge pull request #1590 from martin-frbg/avx512_check
Disable AVX512 (Skylake X) support if the build system is too old
2018-06-04 08:18:38 +02:00
Arjan van de Ven
89372e0993 Use AVX512 also for DGEMM
this required switching to the generic gemm_beta code (which is faster anyway on SKX)
for both DGEMM and SGEMM

Performance for the not-retuned version is in the 30% range
2018-06-03 22:17:27 +00:00
Martin Kroeker
ef626c6824 typo fix 2018-06-04 00:13:19 +02:00
Martin Kroeker
83fec56a3f Disable AVX512 (Skylake X) support if the build system is too old 2018-06-04 00:01:11 +02:00
Martin Kroeker
5a51cf4576 Separate Skylake X from Skylake 2018-06-03 23:41:33 +02:00
Martin Kroeker
5a92b311e0 Separate Skylake X from Skylake 2018-06-03 23:29:07 +02:00
Martin Kroeker
a7d0f49cec Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is available 2018-06-03 23:13:25 +02:00
Martin Kroeker
f1fb9a4745 Propagate NO_AVX512 if needed 2018-06-03 13:48:27 +02:00
Martin Kroeker
0023515733 Typo fix (misplaced parenthesis) 2018-06-03 13:22:59 +02:00
Arjan van de Ven
99c7bba8e4 Initial support for SkylakeX / AVX512
This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server)
target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set,
which brings 2 basic things:
1) 512 bit wide SIMD (2x width of AVX2)
2) 32 SIMD registers (2x the number on AVX2)

This initial patch only contains a trivial transformation of the Haswell SGEMM kernel
to AVX512VL; more will follow later but this patch aims to get the infrastructure
in place for this "later".

Full performance tuning has not been done yet; with more registers and wider SIMD
it's in theory possible to retune the kernels but even without that there's an
interesting enough performance increase (30-40% range) with just this change.
2018-06-03 07:58:52 +00:00
Martin Kroeker
36c4523d85 Merge pull request #1587 from matthew-brett/fix-compile-error-early-glibc
Revert "take out unused variables"
2018-06-02 10:02:38 +02:00
Matthew Brett
a8002e283a Revert "take out unused variables"
This reverts commit e5752ff9b3.

The variables i and n are used in the `#if !__GLIBC_PREREQ(2, 7)`
branch.

Closes gh-1586.
2018-06-01 23:20:00 +01:00
Martin Kroeker
401adddb2b Merge pull request #1585 from martin-frbg/lapack-253
Fixes from Lapack-Reference PR 253
2018-06-01 18:59:33 +02:00
Martin Kroeker
c5b13d4e10 Fixes from netlib PR 253 2018-06-01 15:14:45 +02:00
Martin Kroeker
677e42d7b0 Fixes from netlib PR 253
When minimal workspace is given in ?hesv_aa, ?sysv_aa, ?hesv_aa_2stage, ?sysv_aa_2stage, now no error is given
Quick return for ?laqr1
2018-06-01 15:12:59 +02:00
Martin Kroeker
e2a8c35e5a Fixes from netlib PR253
LAPACKE interfaces for Aasen's functions now call ?sytrf_aa and ?hetrf_aa instead of ?sytrf and ?hetrf
2018-06-01 15:08:14 +02:00
Martin Kroeker
1a49fb1c05 Merge pull request #1584 from martin-frbg/issue1503
Work around name clash with Windows10's winnt.h
2018-05-31 21:56:04 +02:00
Martin Kroeker
8562d5787a Merge pull request #1583 from martin-frbg/issue1575
Handle INCX=0,INCY=0 case
2018-05-31 21:55:26 +02:00
Martin Kroeker
93f1eb09c3 Merge pull request #1582 from martin-frbg/develop-031
Update version number on the develop branch to 0.3.1.dev
2018-05-31 21:55:07 +02:00
Martin Kroeker
c90bbda3df Merge pull request #1581 from martin-frbg/issue1574-2
Fix paths to LIN and EIG tests
2018-05-31 21:54:45 +02:00
Martin Kroeker
7df8c4f76f typo fix 2018-05-31 17:23:08 +02:00
Martin Kroeker
2fc748bf72 Restore optimized swap kernel now that we have a proper fix 2018-05-31 13:41:12 +02:00
Martin Kroeker
a91f1587b9 Work around name clash with Windows10's winnt.h
fixes #1503
2018-05-31 13:26:00 +02:00
Martin Kroeker
d1b7be14aa Handle INCX=0,INCY=0 case
Fixes #1575 (sswap/dswap failing the swap utest on x86) as suggested by atsampson.
2018-05-31 12:52:04 +02:00
Martin Kroeker
b491b10057 Update version to 0.3.1.dev 2018-05-31 12:44:36 +02:00
Martin Kroeker
5fae96fb70 Update version to 0.3.1.dev 2018-05-31 12:43:45 +02:00
Martin Kroeker
a7dbd4c57d Fix paths to LIN and EIG tests
should fix 1574
2018-05-31 11:19:33 +02:00
Martin Kroeker
2cae104b5e Merge pull request #1579 from martin-frbg/issue1574
Adapt lapack-test and blas-test to changes in netlib directory layout
2018-05-29 22:02:06 +02:00
Martin Kroeker
908d40be71 Adapt lapack-test and blas-test to changes in netlib directory layout
partial fix for #1574 - the problem with lapack_testing.py looks like an upstream bug
2018-05-29 14:27:46 +02:00
Zhang Xianyi
43e592ceb3 Add -lm for Android.
Conflicts:
	exports/Makefile
2018-05-24 21:02:42 +08:00
Martin Kroeker
f0f27868d8 Merge pull request #1572 from martin-frbg/issue1571
Use the new zrot.c on POWER8 for crot as well
2018-05-23 22:55:37 +02:00
Martin Kroeker
961d25e9c7 Use the new zrot.c on POWER8 for crot as well
fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly)
2018-05-23 22:54:39 +02:00
762 changed files with 82187 additions and 12208 deletions

143
.drone.yml Normal file
View File

@@ -0,0 +1,143 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest

View File

@@ -4,11 +4,10 @@ dist: precise
sudo: true
language: c
jobs:
matrix:
include:
- &test-ubuntu
os: linux
stage: test
compiler: gcc
addons:
apt:
@@ -26,6 +25,15 @@ jobs:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
@@ -59,7 +67,6 @@ jobs:
- BTYPE="BINARY=32"
- os: linux
stage: test
compiler: gcc
addons:
apt:
@@ -80,13 +87,12 @@ jobs:
# that don't require sudo.
- &test-alpine
os: linux
stage: test
dist: trusty
sudo: true
language: minimal
before_install:
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
install:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
@@ -120,11 +126,10 @@ jobs:
- <<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
- &test-cmake
os: linux
stage: test
compiler: clang
addons:
apt:
@@ -153,8 +158,7 @@ jobs:
- &test-macos
os: osx
stage: test
osx_image: xcode8
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
@@ -165,6 +169,7 @@ jobs:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
osx_image: xcode8.3
env:
- BTYPE="BINARY=32"

View File

@@ -6,21 +6,35 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 0.dev)
set(OpenBLAS_PATCH_VERSION 8.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
include(GNUInstallDirs)
set(OpenBLAS_LIBNAME openblas)
include(CMakePackageConfigHelpers)
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
#######
if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1)
@@ -33,12 +47,27 @@ endif()
#######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
if (NOT DYNAMIC_ARCH)
@@ -51,10 +80,10 @@ endif ()
set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src)
endif()
list(APPEND SUBDIRS lapack)
endif ()
# set which float types we want to build for
@@ -123,7 +152,7 @@ endif ()
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@@ -138,14 +167,9 @@ if (${DYNAMIC_ARCH})
endforeach()
endif ()
# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
# Android needs to explicitly link against libm
if(ANDROID)
@@ -154,7 +178,7 @@ endif()
# Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
# Creates verbose .def file (51KB vs 18KB)
@@ -165,6 +189,7 @@ endif()
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
@@ -186,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
if (MSVC OR NOT NOFORTRAN)
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
endif()
@@ -204,14 +230,92 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
# When a shared library is built together with ReLAPACK, pass the linker a
# flag that tolerates multiply defined symbols (GNU-style option for non-MSVC
# toolchains, /FORCE:MULTIPLE for MSVC).
# NOTE(review): presumably needed because ReLAPACK and LAPACK both define the
# same routine names - confirm.
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()
# Rename the exported symbols of the shared library when SYMBOLPREFIX and/or
# SYMBOLSUFFIX is set: after the library is linked, exports/gensymbol writes an
# objcopy redefinition list and objcopy applies it to the built .so file.
#
# The guard previously tested ${SYMBOLSUFIX} (misspelled, never defined), so a
# build that set only SYMBOLSUFFIX skipped this whole step. The operands are
# quoted so the comparison stays well-formed when a value is empty.
if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")
  # Give every gensymbol argument a defined value; unset options become 0.
  if (NOT DEFINED ARCH)
    set(ARCH_IN "x86_64")
  else()
    set(ARCH_IN ${ARCH})
  endif()
  if (${CORE} STREQUAL "generic")
    set(ARCH_IN "GENERIC")
  endif ()
  if (NOT DEFINED EXPRECISION)
    set(EXPRECISION_IN 0)
  else()
    set(EXPRECISION_IN ${EXPRECISION})
  endif()
  if (NOT DEFINED NO_CBLAS)
    set(NO_CBLAS_IN 0)
  else()
    set(NO_CBLAS_IN ${NO_CBLAS})
  endif()
  if (NOT DEFINED NO_LAPACK)
    set(NO_LAPACK_IN 0)
  else()
    set(NO_LAPACK_IN ${NO_LAPACK})
  endif()
  if (NOT DEFINED NO_LAPACKE)
    set(NO_LAPACKE_IN 0)
  else()
    set(NO_LAPACKE_IN ${NO_LAPACKE})
  endif()
  if (NOT DEFINED NEED2UNDERSCORES)
    set(NEED2UNDERSCORES_IN 0)
  else()
    set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
  endif()
  if (NOT DEFINED ONLY_CBLAS)
    set(ONLY_CBLAS_IN 0)
  else()
    set(ONLY_CBLAS_IN ${ONLY_CBLAS})
  endif()
  if (NOT DEFINED BU)
    set(BU _)
  endif()
  if (NOT "${SYMBOLPREFIX}" STREQUAL "")
    message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
  endif()
  if (NOT "${SYMBOLSUFFIX}" STREQUAL "")
    message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
  endif()
  # NOTE(review): ARCH_IN is computed above but the command below passes
  # ${ARCH}; left unchanged here - confirm which one gensymbol should receive.
  add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
    COMMENT "renaming symbols"
  )
endif()
# Install project
# Install libraries
install(TARGETS ${OpenBLAS_LIBNAME}
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
# Install headers
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
@@ -231,7 +335,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@@ -244,10 +348,11 @@ endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT NO_LAPACKE)
@@ -259,11 +364,31 @@ if(NOT NO_LAPACKE)
ADD_CUSTOM_TARGET(genlapacke
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
endif()
include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif()
# Generate and install the CMake package files (Config, ConfigVersion and
# exported Targets) for the library. Installed file names and the target
# namespace carry ${SUFFIX64}.
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
# Expand cmake/OpenBLASConfig.cmake.in into the (suffixed) Config file.
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
# The version file is written without the suffix and renamed at install time.
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
VERSION ${${PN}_VERSION}
COMPATIBILITY AnyNewerVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
# Install the export set with an "OpenBLAS${SUFFIX64}::" namespace prefix.
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@@ -167,4 +167,7 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel

View File

@@ -1,4 +1,367 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.7
11-Aug-2019
common:
* having the gmake special variables TARGET_ARCH or TARGET_MACH
defined no longer causes build failures in ctest or utest
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
has the same effect as setting them to 1
* a new test program was added to allow checking the library for
thread safety
* a new option USE_LOCKING was added to ensure thread safety when
OpenBLAS itself is built without multithreading but will be
called from multiple threads.
* a build failure on Linux with glibc versions earlier than 2.5
was fixed
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
on glibc 2.6 was fixed
* NO_AFFINITY was added to the CMAKE options (and defaults to being
active on Linux, as in the gmake builds)
x86_64:
* the build-time logic for detection of AVX512 availability in
the processor and compiler was fixed
* gmake builds on OSX now set the internal name of the library to
libopenblas.0.dylib (consistent with CMAKE)
* the Haswell DGEMM kernel received a significant speedup through
improved prefetch and load instructions
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
increased by avoiding vpermpd instructions
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
to fix remaining errors in DGEMM, DSYMM and DTRMM
POWER:
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
* added optimized kernels for POWER9 SGEMM and STRMM
ARMV7:
* fixed the softfp implementations of xAMAX and IxAMAX
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
they were appropriate for only a subset of platforms
====================================================================
Version 0.3.6
29-Apr-2019
common:
* the build tools now check that a given cpu TARGET is actually valid
* the build-time check of system features (c_check) has been made
less dependent on particular perl features (this should mainly
benefit building on Windows)
* several problems with the ReLAPACK integration were fixed,
including INTERFACE64 support and building a shared library
* building with CMAKE on BSD systems was improved
* a non-absolute SUM function was added based on the
existing optimized code for ASUM
* CBLAS interfaces to the IxMIN and IxMAX functions were added
* a name clash between LAPACKE and BOOST headers was resolved
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
kernels
* a crash on thread (key) deletion with the USE_TLS=1 memory management
option was fixed
* restored several earlier fixes, in particular for OpenMP performance,
building on BSD, and calling fork on CYGWIN, which had inadvertently
been dropped in the 0.3.3 rewrite of the memory management code.
x86_64:
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
* building with old versions of MSVC was fixed
* it is now possible to build a static library on Windows with CMAKE
* accessing environment variables on CYGWIN at run time was fixed
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
with CMAKE as well
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
* assembly bugs involving undeclared modification of input operands were fixed
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
* a similar bug was fixed in the blas_quickdivide code used to split workloads
in most functions
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
environment does not support AVX512
* improved GEMM performance on ZEN targets
x86:
* build failures caused by the recently added checks for AVX512 were fixed
* an inline assembly bug involving undeclared modification of an input argument was
fixed in the blas_quickdivide code used to split workloads in most functions
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
MIPS32:
* a bug in the IMIN implementation made it return the result of IMAX
POWER:
* single precision BLAS1/2 functions have received optimized POWER8 kernels
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
* building on PPC970 systems under OSX Leopard or Tiger is now supported
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
* building a shared library on AIX is now supported for POWER6
* DYNAMIC_ARCH support has been added for POWER6 and newer
ARMv7:
* corrected xDOT behaviour with zero INC_X or INC_Y
* a bug in the IMIN implementation made it return the result of IMAX
ARMv8:
* added support for HiSilicon TSV110 cpus
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* cross-compilation with CMAKE now works again
* a bug in the IMIN implementation made it return the result of IMAX
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
IBM Z:
* optimized microkernels for single precision BLAS1/2 functions have been added
for both Z13 and Z14
====================================================================
Version 0.3.5
31-Dec-2018
common:
* loop unrolling in TRMV has been enabled again.
* A domain error in the thread workload distribution for SYRK
has been fixed.
* gmake builds will now automatically add -fPIC to the build
options if the platform requires it.
* a pthreads key leakage (and associated crash on dlclose) in
the USE_TLS codepath was fixed.
* building of the utest cases on systems that do not provide
an implementation of complex.h was fixed.
x86_64:
* the SkylakeX code was changed to compile on OSX.
* unwanted application of the -march=skylake-avx512 option
to the common code parts of a DYNAMIC_ARCH build was fixed.
* improved performance of SGEMM for small workloads on Skylake X.
* performance of SGEMM and DGEMM was improved on Haswell.
ARMV8:
* a configuration error that broke the CNRM2 kernel was corrected.
* compilation of the GEMM kernels with CMAKE was fixed.
* DYNAMIC_ARCH builds are now available with CMAKE as well.
* using CMAKE for cross-compilation to the new cpu TARGETs
introduced in 0.3.4 now works.
POWER:
* a problem in cpu autodetection for AIX has been corrected.
====================================================================
Version 0.3.4
02-Dec-2018
common:
* the new, experimental thread-local memory allocation had
inadvertently been left enabled for gmake builds in 0.3.3
despite the announcement. It is now disabled by default, and
single-threaded builds will keep using the old allocator even
if the USE_TLS option is turned on.
* OpenBLAS will now provide enough buffer space for at least 50
threads by default.
* The output of openblas_get_config() now contains the version
number.
* A serious thread safety bug in GEMV operation with small M and
large N size has been fixed.
* The code will now automatically call blas_thread_init after a
fork if needed before handling a call to openblas_set_num_threads
* Accesses to parallelized level3 functions from multiple callers
are now serialized to avoid thread races (unless using OpenMP).
This should provide better performance than the known-threadsafe
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
* When building LAPACK with gfortran, -frecursive is now (again)
enabled by default to ensure correct behaviour.
* The OpenBLAS version of cblas.h now supports both CBLAS_ORDER and
CBLAS_LAYOUT as the name of the matrix row/column order option.
* Externally set LDFLAGS are now passed through to the final compile/link
steps to facilitate setting platform-specific linker flags.
* A potential race condition during the build of LAPACK (that would
usually manifest itself as a failure to build TESTING/MATGEN) has been
fixed.
* xHEMV has been changed to stay single-threaded for small input sizes
where the overhead of multithreading exceeds any possible gains
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
ThunderX hardware with sizable input.
* Linker flags for the PGI compiler have been updated
* Behaviour of AXPY with zero increments is now handled in the C interface,
correcting the result on at least Intel Atom.
* The result matrix from calling SGELSS with an all-zero input matrix is
now zeroed completely.
x86_64:
* Autodetection of AMD Ryzen2 has been fixed (again).
* CMAKE builds now support labeling of an INTERFACE64=1 build of
the library with the _64 suffix.
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
has been sped up by rewriting with C intrinsics
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
POWER:
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
* CPU type detection has been implemented for AIX.
* CPU type detection has been fixed for NETBSD.
MIPS64:
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
* DSDOT on LOONGSON3A has been fixed.
* the SGEMM microkernel has been hardened against potential data loss.
ARMV8:
* DYNAMIC_ARCH support is now available for 64bit ARM
* cross-compiling for ARMV8 under iOS now works.
* cpu-specific code has been rearranged to make better use of both
hardware commonalities and model-specific compiler optimizations.
* XGENE1 has been removed as a TARGET, superseded by the improved generic
ARMV8 support.
ARMV7:
* Older assembly mnemonics have been converted to UAL form to allow
building with clang 7.0
* Cross compiling LAPACKE for Android has been fixed again (broken by
update to LAPACK 3.7.0 some while ago).
====================================================================
Version 0.3.3
31-Aug-2018
common:
* thread memory allocation has been switched back to the method
used before version 0.3.1 due to unexpected problems caused by
the new code under some circumstances. A new compile-time option
USE_TLS has been added to enable the new code, and it is hoped
that this can become the default again in the next version.
* LAPACK PR272 has been integrated, which fixes spurious errors
in DSYEVR and related functions caused by missing conversion
from ILAENV to ILAENV_2STAGE in several _2stage routines.
* the cmake-generated OpenBLASConfig.cmake now uses correct case
for the name of the library
* added support for Haiku OS
x86_64:
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
DSCAL, DGEMVN and DSYMVL
* added a workaround for a cygwin issue that prevented compilation
of AVX512 code
IBM Z:
* added autodetection of Z14
* fixed TRMM errors in the generic target
====================================================================
Version 0.3.2
30-Jul-2018
common:
* fixes for regressions caused by the rewrite of the thread
initialization code in 0.3.1
POWER:
* fixed cpu autodetection for the BSDs
MIPS64:
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
x86_64:
* added autodetection of AMD Ryzen 2
* fixed build with older versions of MSVC
====================================================================
Version 0.3.1
01-Jul-2018
common:
* rewritten thread initialization code with significantly reduced overhead
* added CBLAS interfaces to the IxAMIN BLAS extension functions
* fixed the lapack-test target
* CMAKE builds now create an OpenBLASConfig.cmake file
* ZAXPY now uses a single thread for small input sizes
* the LAPACK code was updated from Reference-LAPACK/lapack#253
(fixing LAPACKE interfaces to Aasen's functions)
POWER:
* corrected CROT and ZROT behaviour with zero INC_X
ARMV7:
* corrected xDOT behaviour with zero INC_X or INC_Y
x86_64:
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
specify the list of x86_64 targets to include. Any target not on the list will be supported
by the Sandybridge or Nehalem kernels if available, or by Prescott.
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
* added autodetection of Intel Cannon Lake series as Skylake X
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
recent mingw from MSYS2
* fixed a link error in mixed clang/gfortran builds with OpenMP
* updated the OSX deployment target to 10.8
* switched on parallel make for builds on MS Windows by default
x86:
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
====================================================================
Version 0.3.0
23-May-2018
common:
* fixed some more thread race and locking bugs
* added preliminary support for calling an OpenMP build of the library from multiple threads
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
* general code cleanup
* optimized DSDOT implementation
* improved thread distribution for GEMM
* corrected IMATCOPY/OMATCOPY implementation
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
* cmake build improvements
* pkgconfig file now contains build options
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
* corrections and improvements for systems with more than 64 cpus
* LAPACK code updated to 3.8.0 including later fixes
* added ReLAPACK, a recursive implementation of several LAPACK functions
* Rewrote ROTMG to handle cases that the netlib code failed to address
* Disabled (broken) multithreading code for xTRMV
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
* shared memory access failures on startup are now handled more gracefully
* restored utests from earlier releases (and made them pass on all affected systems)
SPARC:
* several fixes for cpu autodetection
POWER:
* corrected vector register overwriting in several Power8 kernels
* optimized additional BLAS functions
ARM:
* added support for CortexA53 and A72
* added autodetection for ThunderX2T99
* made most optimized kernels the default for generic ARMv8 targets
x86_64:
* parallelized DDOT kernel for Haswell
* changed alignment directives in assembly kernels to boost performance on OSX
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
* added support for building on OpenBSD and Dragonfly
* updated compiler options to work with Intel release 2018
* support fully optimized build with clang/flang on Microsoft Windows
* fixed building on AIX
IBM Z:
* added optimized BLAS 1/2 functions
MIPS:
* fixed cpu autodetection helper code
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
* added mips64 I6500 cpu
====================================================================
Version 0.2.20
24-Jul-2017

View File

@@ -21,9 +21,20 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -47,7 +58,7 @@ endif
endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif
ifneq ($(OSNAME), AIX)
@@ -85,8 +96,8 @@ endif
@echo
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@@ -98,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@@ -108,19 +120,22 @@ endif
endif
tests :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
$(MAKE) -C utest all
endif
$(MAKE) -C utest all
ifndef NO_CBLAS
$(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif
endif
libs :
ifeq ($(CORE), UNKOWN)
ifeq ($(CORE), UNKNOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
ifeq ($(NOFORTRAN), 1)
@@ -153,6 +168,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
ifeq ($(DYNAMIC_OLDER), 1)
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
endif
endif
ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
@@ -207,7 +225,7 @@ netlib :
else
netlib : lapack_prebuild
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
@@ -228,7 +246,7 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack_prebuild :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -237,7 +255,7 @@ ifndef NOFORTRAN
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -253,6 +271,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else ifeq ($(OSNAME), Haiku)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
@@ -271,21 +291,21 @@ endif
endif
large.tgz :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif
timing.tgz :
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif
lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
@@ -294,9 +314,10 @@ endif
lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif
@@ -308,9 +329,9 @@ lapack-runtest:
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
dummy :

View File

@@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
CCOMMON_OPT += -mfpu=neon
FCOMMON_OPT += -mfpu=neon
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@@ -9,11 +9,6 @@ endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp
endif

View File

@@ -4,22 +4,42 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
endif
ifeq ($(CORE), VULCAN)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
endif
ifeq ($(CORE), CORTEXA72)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
ifeq ($(CORE), CORTEXA73)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
endif
ifeq ($(CORE), FALKOR)
CCOMMON_OPT += -march=armv8-a -mtune=falkor
FCOMMON_OPT += -march=armv8-a -mtune=falkor
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif

View File

@@ -48,6 +48,7 @@ ifndef NO_CBLAS
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@@ -57,21 +58,22 @@ ifndef NO_LAPACKE
endif
#for install static library
ifndef NO_STATIC
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@@ -81,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@@ -93,12 +96,39 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif
endif
else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
#Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@@ -109,7 +139,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@@ -9,7 +9,15 @@ else
USE_OPENMP = 1
endif
ifeq ($(CORE), POWER9)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
endif
endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
@@ -21,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif
endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.0.dev
VERSION = 0.3.8.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
# If you want to support multiple architecture in one binary
# DYNAMIC_ARCH = 1
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
# mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON,
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
# DYNAMIC_OLDER = 1
# C compiler including binary type(32bit / 64bit). Default is gcc.
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
# CC = gcc
@@ -43,6 +48,8 @@ VERSION = 0.3.0.dev
# HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# Please note that AVX is not available on 32-bit.
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
# BINARY=64
# About threaded BLAS. It will be automatically detected if you don't
@@ -51,40 +58,63 @@ VERSION = 0.3.0.dev
# For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script.
# The OpenMP scheduler to use - by default this is "static" and you
# will normally not want to change this unless you know that your main
# workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define the maximum number of threads. Basically it should be less
# than or equal to the number of CPU threads. If you don't specify one, it's
# automatically detected by the build system.
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
# detection includes logical CPUs, thus allowing the use of SMT.
# Users may opt at runtime to use less than NUM_THREADS threads.
#
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
# footprint penalty, even if users reduce the actual number of threads at runtime.
# NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# OpenBLAS's calculation API from multiple threads, please comment this in.
# This flag defines how many instances of OpenBLAS's calculation API can actually
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in.
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1
# if you don't need generate the shared library, please comment it in.
# If you don't need to generate the shared library, please comment this in.
# NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in.
# If you don't need the CBLAS interface, please comment this in.
# NO_CBLAS = 1
# If you only want CBLAS interface without installing Fortran compiler,
# please comment it in.
# If you only want the CBLAS interface without installing a Fortran compiler,
# please comment this in.
# ONLY_CBLAS = 1
# If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# If you don't need LAPACK, please comment this in.
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
# NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0
@@ -93,12 +123,18 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses
# thread-local storage instead of a central memory buffer in memory.c
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
# for this to work.
# USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
# are not sure(equivalent to "-i8" option).
# compilers support this. It's safe to keep this commented out if you
# are not sure. (This is equivalent to the "-i8" ifort option).
# INTERFACE64 = 1
# Unfortunately most of kernel won't give us high quality buffer.
@@ -106,10 +142,18 @@ BUILD_LAPACK_DEPRECATED = 1
# but it will consume time. If you don't like it, you can disable one.
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
# This feature is only implemented on Linux, and is always disabled on other platforms.
# Enabling affinity handling may improve performance, especially on NUMA systems, but
# it may conflict with certain applications that also try to manage affinity.
# This conflict can result in threads of the application calling OpenBLAS ending up locked
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
# else modifies affinity settings.
# Note: enabling affinity has been known to cause problems with NumPy and R
NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
@@ -119,6 +163,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@@ -133,6 +181,9 @@ NO_AFFINITY = 1
# FUNCTION_PROFILE = 1
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
# This option should not be used - it is a holdover from unfinished code present
# in the original GotoBLAS2 library that may be usable as a starting point but
# is not even expected to compile in its present form.
# QUAD_PRECISION = 1
# Threads are still working for a while after finishing BLAS operation
@@ -140,22 +191,25 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT
# system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory
# Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4.
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with a single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very
# If you need sanity check by comparing results to reference BLAS. It'll be very
# slow (Not implemented yet).
# SANITY_CHECK = 1
@@ -167,8 +221,8 @@ NO_AFFINITY = 1
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2
# gfortran option for LAPACK
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
# gfortran option for LAPACK to improve thread-safety
# It is enabled by default in Makefile.system for gfortran
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive
@@ -195,6 +249,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
#
# End of user configuration
#

View File

@@ -9,6 +9,22 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture.
ifndef ARCH
ARCH := $(shell uname -m)
endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# Default C compiler
@@ -54,6 +70,7 @@ endif
ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
GETARCH_FLAGS += -DUSER_TARGET
endif
# Force fallbacks for 32bit
@@ -62,6 +79,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -80,6 +100,9 @@ endif
ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
endif
@@ -95,6 +118,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -116,7 +142,12 @@ endif
endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(ARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@@ -134,13 +165,18 @@ GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
endif
ifeq ($(NO_AVX2), 1)
GETARCH_FLAGS += -DNO_AVX2
endif
ifeq ($(NO_AVX512), 1)
GETARCH_FLAGS += -DNO_AVX512
endif
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
@@ -211,6 +247,10 @@ SMP = 1
endif
endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC
NEED_PIC = 1
endif
@@ -227,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS.
# When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
override FEXTRALIB =
endif
#
@@ -238,7 +279,7 @@ endif
ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6
export MACOSX_DEPLOYMENT_TARGET=10.8
endif
MD5SUM = md5 -r
endif
@@ -362,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
#
# Architecture dependent settings
#
@@ -462,13 +509,50 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
DYNAMIC_CORE = PRESCOTT CORE2
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += PENRYN DUNNINGTON
endif
DYNAMIC_CORE += NEHALEM
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += OPTERON OPTERON_SSE3
endif
DYNAMIC_CORE += BARCELONA
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += BOBCAT ATOM NANO
endif
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL ZEN
endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX
endif
endif
endif
ifdef DYNAMIC_LIST
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
CCOMMON_OPT += $(XCCOMMON_OPT)
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
endif
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
DYNAMIC_CORE += POWER9
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@@ -679,6 +763,10 @@ endif
ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
@@ -902,6 +990,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH
endif
ifeq ($(DYNAMIC_OLDER), 1)
CCOMMON_OPT += -DDYNAMIC_OLDER
endif
ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK
#Disable LAPACK C interface
@@ -924,6 +1016,10 @@ ifeq ($(NO_AVX2), 1)
CCOMMON_OPT += -DNO_AVX2
endif
ifeq ($(NO_AVX512), 1)
CCOMMON_OPT += -DNO_AVX512
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
@@ -976,6 +1072,12 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif
@@ -1023,8 +1125,12 @@ endif
endif
ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)
override undefine NO_AFFINITY
else
CCOMMON_OPT += -DNO_AFFINITY
endif
endif
ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE
@@ -1086,8 +1192,6 @@ ifndef FCOMMON_OPT
FCOMMON_OPT = -O2 -frecursive
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
@@ -1095,6 +1199,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
ifdef NEED_PIC
ifeq (,$(findstring PIC,$(FFLAGS)))
override FFLAGS += -fPIC
endif
endif
#For LAPACK Fortran codes.
#Disable -fopenmp for LAPACK Fortran codes on Windows.
ifdef OS_WINDOWS
@@ -1153,7 +1263,11 @@ endif
LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
@@ -1230,6 +1344,7 @@ export MSA_FLAGS
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE
export NO_AVX512
export SGEMM_UNROLL_M
export SGEMM_UNROLL_N

View File

@@ -8,6 +8,38 @@ endif
endif
endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif
ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64
endif

View File

@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
@@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@@ -110,10 +110,12 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@@ -132,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS
@@ -200,6 +204,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.

View File

@@ -20,6 +20,7 @@ DUNNINGTON
NEHALEM
SANDYBRIDGE
HASWELL
SKYLAKEX
ATOM
b)AMD CPU:
@@ -47,6 +48,7 @@ POWER5
POWER6
POWER7
POWER8
POWER9
PPCG4
PPC970
PPC970MP
@@ -82,11 +84,16 @@ ARMV5
8.ARM 64-bit CPU:
ARMV8
CORTEXA53
CORTEXA57
VULCAN
CORTEXA72
CORTEXA73
FALKOR
THUNDERX
THUNDERX2T99
TSV110
9.System Z:
ZARCH_GENERIC
Z13
Z14

View File

@@ -35,7 +35,14 @@ environment:
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- COMPILER: MinGW64-gcc-7.2.0
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
@@ -52,10 +59,17 @@ install:
before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
build_script:
- cmake --build .
@@ -64,3 +78,4 @@ test_script:
- echo Running Test
- cd utest
- openblas_utest

51
azure-pipelines.yml Normal file
View File

@@ -0,0 +1,51 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {

View File

@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
FLOAT beta [] = {1.0, 0.0};
char trans='N';
blasint m, i, j;
blasint inc_x=1,inc_y=1;

View File

@@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,29 +28,21 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,26 +28,13 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
A <- matrix(runif(n * n), nrow = n)
B <- matrix(runif(n * n), nrow = n)
C <- 1
z <- system.time(for (l in 1:loops) {
@@ -54,11 +42,10 @@ while (n <= nto) {
l <- l + 1
})
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

View File

@@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}
}
p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,31 +28,22 @@ if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), nrow = n)
z <- system.time(for (l in 1:loops) {
solve(A, B)
})
mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep
}

87
c_check
View File

@@ -1,7 +1,7 @@
#!/usr/bin/perl
use File::Basename;
use File::Temp qw(tempfile);
#use File::Basename;
# use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 );
#$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@@ -31,12 +31,25 @@ if ($?) {
$cross_suffix = "";
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
eval "use File::Basename";
if ($@){
warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
}
$compiler = "";
@@ -64,6 +77,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@@ -170,20 +184,26 @@ if ($?) {
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$have_msa = 1;
$tmpf = new File::Temp( UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
unlink("$tmpf.o");
}
$architecture = x86 if ($data =~ /ARCH_X86/);
@@ -201,6 +221,29 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
# Probe whether the C compiler can assemble AVX512 instructions.
# $no_avx512 ends up 1 only when the test compile fails; if File::Temp is
# unavailable the probe is skipped and AVX512 support is assumed.
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
    eval "use File::Temp qw(tempfile)";
    if ($@){
        warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
        $no_avx512 = 0;
    } else {
        # tempfile() returns (filehandle, filename): the source must be written
        # through the handle, while the compiler is given the file name.
        ($fh,$tmpf) = tempfile( UNLINK => 1 );
        $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
        print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
        # Flush the test program to disk before the compiler reads it.
        close($fh);
        $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
        my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
        system(@cmd);
        if ($? != 0) {
            $no_avx512 = 1;
        } else {
            $no_avx512 = 0;
        }
        unlink("$tmpf.o");
    }
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/;
@@ -208,7 +251,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;
$cross = 0;
$cross = 1 if ($os ne $hostos);
if ($architecture ne $hostarch) {
$cross = 1;
@@ -216,6 +258,8 @@ if ($architecture ne $hostarch) {
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
}
$cross = 1 if ($os ne $hostos);
$openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = "";
@@ -288,6 +332,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;

23
cblas.h
View File

@@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
typedef CBLAS_ORDER CBLAS_LAYOUT;
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
@@ -72,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
@@ -82,6 +88,21 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@@ -0,0 +1,79 @@
# OpenBLASConfig.cmake
# --------------------
#
# OpenBLAS cmake module.
# This module sets the following variables in your project::
#
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
# OpenBLAS_INCLUDE_DIR - same as DIRS
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
# OpenBLAS_LIBRARY - same as LIBRARIES
#
#
# Available components::
#
## shared - search for only shared library
## static - search for only static library
# serial - search for unthreaded library
# pthread - search for native pthread threaded library
# openmp - search for OpenMP threaded library
#
#
# Exported targets::
#
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
## target. Target is shared _or_ static, so, for both, use separate, not
## overlapping, installations. ::
#
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
#
#
# Suggested usage::
#
# find_package(OpenBLAS)
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
#
#
# The following variables can be set to guide the search for this package::
#
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
# PATH - environment variable, set to bin directory of this package
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
@PACKAGE_INIT@
set(PN OpenBLAS)
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
if(@USE_OPENMP@)
set(${PN}_openmp_FOUND 1)
elseif(@USE_THREAD@)
set(${PN}_pthread_FOUND 1)
else()
set(${PN}_serial_FOUND 1)
endif()
check_required_components(${PN})
#-----------------------------------------------------------------------------
# Don't include targets if this file is being picked up by another
# project which has already built this as a subproject
#-----------------------------------------------------------------------------
if(NOT TARGET ${PN}::OpenBLAS)
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
set(${PN}_LIBRARY ${_loc})
get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
set(${PN}_LIBRARIES ${_ill})
get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
set(${PN}_INCLUDE_DIR ${_id})
get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
set(${PN}_INCLUDE_DIRS ${_iid})
endif()

View File

@@ -44,22 +44,45 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
endif ()
if (X86)
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
endif ()
if (X86_64)
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
set(DYNAMIC_CORE PRESCOTT CORE2)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
endif ()
if (NOT NO_AVX)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
endif ()
if (NOT NO_AVX2)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif ()
if (NOT NO_AVX512)
  set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
  # Strip -march=native from the C flags when SKYLAKEX is added to the core list.
  # The input is quoted so that an empty CMAKE_C_FLAGS is still passed as an
  # (empty) argument instead of making string(REGEX REPLACE) fail for lack of input.
  string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
endif ()
endif ()

View File

@@ -3,6 +3,11 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
@@ -39,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
# Append to the existing EXTRALIB value ("${EXTRALIB}"); without the "$" the
# literal text "{EXTRALIB}" would replace whatever had been collected so far.
set(EXTRALIB "${EXTRALIB} -lgfortran")

View File

@@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)
@@ -107,6 +107,12 @@ macro(SetDefaultL1)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
endmacro ()
macro(SetDefaultL2)
@@ -162,4 +168,4 @@ macro(SetDefaultL3)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
endmacro ()
endmacro ()

View File

@@ -1,10 +1,11 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas
Libs: -L${libdir} -lopenblas${libsuffix}
Cflags: -I${includedir}

View File

@@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
set(EXTRALIB "${EXTRALIB} -lm")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
set(EXTRALIB "${EXTRALIB} -lm")
endif ()

View File

@@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_")
endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
@@ -82,18 +85,28 @@ endif ()
# f_check
if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif ()
# Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would
if (DEFINED TARGET_CORE)
set(TCORE ${TARGET_CORE})
else()
set(TCORE ${CORE})
endif()
# TODO: Set up defines that getarch sets up based on every other target
# Perhaps this should be inside a different file as it grows larger
file(APPEND ${TARGET_CONF_TEMP}
"#define ${CORE}\n"
"#define CHAR_CORENAME \"${CORE}\"\n")
if ("${CORE}" STREQUAL "ARMV7")
"#define ${TCORE}\n"
"#define CHAR_CORENAME \"${TCORE}\"\n")
if ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n"
@@ -108,7 +121,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "ARMV8")
elseif ("${TCORE}" STREQUAL "ARMV8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
@@ -116,18 +129,26 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t32\n")
set(SGEMM_UNROLL_M 4)
"#define L2_ASSOCIATIVE\t32\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "CORTEXA57")
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t2097152\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
@@ -135,15 +156,124 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n")
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX")
# Cache/TLB parameters written to the target config for THUNDERX.
# L2_SIZE is 16 MiB (16777216); the previous value 167772164 carried a stray
# trailing digit. NOTE(review): confirm against the THUNDERX entry in getarch.c.
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t16777216\n"
"#define L2_LINESIZE\t128\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t8\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t8\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define L3_SIZE\t33554432\n"
"#define L3_LINESIZE\t64\n"
"#define L3_ASSOCIATIVE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
endif()
# Or should this actually be NUM_CORES?
@@ -163,6 +293,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
file(APPEND ${TARGET_CONF_TEMP}
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
# Move to where gen_config_h would place it
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
else(NOT CMAKE_CROSSCOMPILING)

View File

@@ -33,19 +33,50 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
set(TARGET "ARMV7")
endif ()
endif ()
# Add the ISA-specific compiler flags needed to build the kernels of the
# explicitly selected TARGET.
if (DEFINED TARGET)
  if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
  endif()
  if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
      # -mavx2 is only passed to GCC 4.7 or newer.
      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
      if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
      endif()
    elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
      # CMake reports the compiler ID as "Clang" / "AppleClang"; the former
      # case-sensitive comparison against "CLANG" could never match.
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
    endif()
  endif()
endif()
if (DEFINED TARGET)
message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available: for 32-bit builds (BINARY=32 requested,
# or a 4-byte pointer size) getarch is built with all AVX variants disabled.
if (X86 OR X86_64)
  # "${CMAKE_SIZEOF_VOID_P}" must be a proper variable reference; the malformed
  # "$CMAKE_SIZEOF_VOID_P}" was a literal string that never compared equal to 4.
  if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
    set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
  endif ()
endif ()
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -117,10 +148,16 @@ endif ()
if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
@@ -137,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif ()
if (BINARY64)
@@ -162,7 +202,22 @@ if (NEED_PIC)
endif ()
if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif ()
if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif ()
if (NO_LAPACK)
@@ -211,6 +266,10 @@ if (CONSISTENT_FPCSR)
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
endif ()
if (USE_TLS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
endif ()
# Only for development
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
@@ -250,7 +309,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
@@ -297,6 +356,8 @@ if (MIXED_MEMORY_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})

View File

@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT)
endif ()
if (${HOST_OS} STREQUAL "LINUX")
  # check if we're building natively on Android (TERMUX):
  # `uname -o` reports the operating system name, `tr` strips the newline.
  EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
  # Quote the expansion: if uname produced no output the unquoted form would
  # collapse to `if( MATCHES "Android")`, which is a CMake syntax error.
  if("${OPERATING_SYSTEM}" MATCHES "Android")
    set(HOST_OS ANDROID)
  endif()
endif()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
@@ -29,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
set(ARM64 1)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
set(ARM 1)
endif()
endif()
if (X86_64)
@@ -66,3 +84,12 @@ else()
set(BINARY32 1)
endif()
# Probe whether the C compiler/assembler accepts AVX512 instructions by
# compiling a one-line inline-asm test program; if the compile returns 1,
# build everything with -DNO_AVX512.
if (X86_64 OR X86)
  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
  execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
  if (NO_AVX512 EQUAL 1)
    set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
  endif()
  # Remove the probe files from where they were actually written. Relative
  # paths passed to file(REMOVE) are resolved against the current source
  # directory, so the bare file names never matched the files created above.
  file(REMOVE "${PROJECT_BINARY_DIR}/avx512.tmp" "${PROJECT_BINARY_DIR}/avx512.o")
endif()

View File

@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@@ -85,6 +85,8 @@ extern "C" {
#if !defined(_MSC_VER)
#include <unistd.h>
#elif _MSC_VER < 1900
#define snprintf _snprintf
#endif
#include <time.h>
@@ -105,6 +107,10 @@ extern "C" {
#endif
#endif
#ifdef OS_HAIKU
#define NO_SYSV_IPC
#endif
#ifdef OS_WINDOWS
#ifdef ATOM
#define GOTO_ATOM ATOM
@@ -125,7 +131,7 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#include <math.h>
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h>
#endif
#endif
@@ -179,7 +185,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
#ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
@@ -194,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!"
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK
#endif
@@ -253,8 +259,14 @@ typedef unsigned long BLASULONG;
#ifdef USE64BITINT
typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else
typedef int blasint;
#define blasabs(x) abs(x)
#endif
#else
#ifdef USE64BITINT
@@ -338,6 +350,11 @@ typedef int blasint;
#endif
#endif
#ifdef POWER9
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/*
#ifdef PILEDRIVER
@@ -429,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else
@@ -642,6 +659,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
#ifdef USE_OPENMP
#ifndef C_MSVC
int omp_in_parallel(void);
int omp_get_num_procs(void);
@@ -649,12 +667,21 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif
#if (__STDC_VERSION__ >= 201112L)
#if defined(C_GCC) && ( __GNUC__ < 7)
// workaround for GCC bug 65467
#ifndef _Atomic
#define _Atomic volatile
#endif
#include <stdatomic.h>
#endif
#include <stdatomic.h>
#else
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#else
#ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak));

View File

@@ -19,6 +19,7 @@
#define CDOTC_K cdotc_k
#define CNRM2_K cnrm2_k
#define CSCAL_K cscal_k
#define CSUM_K csum_k
#define CSWAP_K cswap_k
#define CROT_K csrot_k
@@ -249,6 +250,7 @@
#define CDOTC_K gotoblas -> cdotc_k
#define CNRM2_K gotoblas -> cnrm2_k
#define CSCAL_K gotoblas -> cscal_k
#define CSUM_K gotoblas -> csum_k
#define CSWAP_K gotoblas -> cswap_k
#define CROT_K gotoblas -> csrot_k

View File

@@ -19,6 +19,7 @@
#define DDOTC_K ddot_k
#define DNRM2_K dnrm2_k
#define DSCAL_K dscal_k
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k
@@ -174,6 +175,7 @@
#define DDOTC_K gotoblas -> ddot_k
#define DNRM2_K gotoblas -> dnrm2_k
#define DSCAL_K gotoblas -> dscal_k
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k

View File

@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);

View File

@@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
double zasum_k (BLASLONG, double *, BLASLONG);
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
float ssum_k (BLASLONG, float *, BLASLONG);
double dsum_k (BLASLONG, double *, BLASLONG);
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
float csum_k (BLASLONG, float *, BLASLONG);
double zsum_k (BLASLONG, double *, BLASLONG);
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
float samax_k (BLASLONG, float *, BLASLONG);
double damax_k (BLASLONG, double *, BLASLONG);
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);

View File

@@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
extern "C" {
#endif
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
float * A, BLASLONG strideA,
float * B, BLASLONG strideB,
float * R, BLASLONG strideR);
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,

View File

@@ -66,6 +66,7 @@
#define DOTC_K QDOTC_K
#define NRM2_K QNRM2_K
#define SCAL_K QSCAL_K
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K
@@ -356,6 +357,7 @@
#define DOTC_K DDOTC_K
#define NRM2_K DNRM2_K
#define SCAL_K DSCAL_K
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K
@@ -658,6 +660,7 @@
#define DOTC_K SDOTC_K
#define NRM2_K SNRM2_K
#define SCAL_K SSCAL_K
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K
@@ -962,6 +965,7 @@
#define DOTC_K XDOTC_K
#define NRM2_K XNRM2_K
#define SCAL_K XSCAL_K
#define SUM_K XSUM_K
#define SWAP_K XSWAP_K
#define ROT_K XROT_K
@@ -1363,6 +1367,7 @@
#define DOTC_K ZDOTC_K
#define NRM2_K ZNRM2_K
#define SCAL_K ZSCAL_K
#define SUM_K ZSUM_K
#define SWAP_K ZSWAP_K
#define ROT_K ZROT_K
@@ -1785,6 +1790,7 @@
#define DOTC_K CDOTC_K
#define NRM2_K CNRM2_K
#define SCAL_K CSCAL_K
#define SUM_K CSUM_K
#define SWAP_K CSWAP_K
#define ROT_K CROT_K

View File

@@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
#define RPCC_DEFINED
#ifndef NO_AFFINITY
#define WHEREAMI
//#define WHEREAMI
static inline int WhereAmI(void){
int ret=0;
__asm__ __volatile__(".set push \n"

View File

@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG);
float (*ssum_k) (BLASLONG, float *, BLASLONG);
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG);
double (*dsum_k) (BLASLONG, double *, BLASLONG);
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
double (*zasum_k) (BLASLONG, double *, BLASLONG);
double (*zsum_k) (BLASLONG, double *, BLASLONG);
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);

View File

@@ -39,7 +39,7 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#if defined(POWER8)
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
#else
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
#if defined(POWER8)
#if defined(POWER8) || defined(POWER9)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
@@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define PROLOGUE \
.section .text;\
@@ -598,9 +598,14 @@ REALNAME:;\
#ifndef __64BIT__
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\
.REALNAME:;
.REALNAME:
#define EPILOGUE \
_section_.text:;\
@@ -611,9 +616,14 @@ _section_.text:;\
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\
.REALNAME:;
.REALNAME:
#define EPILOGUE \
_section_.text:;\
@@ -774,7 +784,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1
@@ -802,7 +812,7 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#elif defined(POWER8) || defined(POWER9)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
@@ -819,7 +829,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON
#endif
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8)
#else

View File

@@ -19,6 +19,7 @@
#define QDOTC_K qdot_k
#define QNRM2_K qnrm2_k
#define QSCAL_K qscal_k
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k
@@ -161,6 +162,7 @@
#define QDOTC_K gotoblas -> qdot_k
#define QNRM2_K gotoblas -> qnrm2_k
#define QSCAL_K gotoblas -> qscal_k
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k

View File

@@ -12,6 +12,7 @@
#define ISMAX_K ismax_k
#define ISMIN_K ismin_k
#define SASUM_K sasum_k
#define SSUM_K ssum_k
#define SAXPYU_K saxpy_k
#define SAXPYC_K saxpy_k
#define SCOPY_K scopy_k
@@ -170,6 +171,7 @@
#define ISMAX_K gotoblas -> ismax_k
#define ISMIN_K gotoblas -> ismin_k
#define SASUM_K gotoblas -> sasum_k
#define SSUM_K gotoblas -> ssum_k
#define SAXPYU_K gotoblas -> saxpy_k
#define SAXPYC_K gotoblas -> saxpy_k
#define SCOPY_K gotoblas -> scopy_k

View File

@@ -45,16 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
* Choosing a SIZE that is too small will lead to stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
/* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
//Original OpenBLAS/GotoBLAS codes.

View File

@@ -19,6 +19,7 @@
#define XDOTC_K xdotc_k
#define XNRM2_K xnrm2_k
#define XSCAL_K xscal_k
#define XSUM_K xsum_k
#define XSWAP_K xswap_k
#define XROT_K xqrot_k
@@ -227,6 +228,7 @@
#define XDOTC_K gotoblas -> xdotc_k
#define XNRM2_K gotoblas -> xnrm2_k
#define XSCAL_K gotoblas -> xscal_k
#define XSUM_K gotoblas -> xsum_k
#define XSWAP_K gotoblas -> xswap_k
#define XROT_K gotoblas -> xqrot_k

View File

@@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
return result;
#endif
@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -60,8 +60,13 @@
#endif
*/
#define MB
#define WMB
#ifdef __GNUC__
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif
static void __inline blas_lock(volatile BLASULONG *address){
@@ -124,7 +129,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2];
*edx=cpuinfo[3];
#else
__asm__ __volatile__("cpuid"
__asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
@@ -205,7 +211,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
return result;
}
@@ -271,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -19,6 +19,7 @@
#define ZDOTC_K zdotc_k
#define ZNRM2_K znrm2_k
#define ZSCAL_K zscal_k
#define ZSUM_K zsum_k
#define ZSWAP_K zswap_k
#define ZROT_K zdrot_k
@@ -249,6 +250,7 @@
#define ZDOTC_K gotoblas -> zdotc_k
#define ZNRM2_K gotoblas -> znrm2_k
#define ZSCAL_K gotoblas -> zscal_k
#define ZSUM_K gotoblas -> zsum_k
#define ZSWAP_K gotoblas -> zswap_k
#define ZROT_K gotoblas -> zdrot_k

14
cpp_thread_test/Makefile Normal file
View File

@@ -0,0 +1,14 @@
# Build and run the C++ thread-safety testers against the static library
# ../libopenblas.a. Each tester is executed right after it has been linked.
include ../Makefile.rule
# Default target: build (and run) both testers.
all :: dgemv_tester dgemm_tester
# DGEMV tester: compiled with OpenMP and C++11, then run immediately.
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
# DGEMM tester: listed as depending on dgemv_tester, so the DGEMV test is
# built and run first.
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
# Remove the tester executables.
clean ::
rm -f dgemv_tester dgemm_tester

View File

@@ -0,0 +1,55 @@
inline void pauser(){
    /// Portable pause: print a prompt on std::cout, then block until one
    /// line (terminated by Enter) has been read from std::cin. The line
    /// that was read is discarded.
    std::cout << "Press enter to continue...";
    std::string discardedLine;
    std::getline(std::cin, discardedLine);
}
/// Fills the first numMat matrices of matBlock with values drawn from
/// rngdist(PRNG), then copies that group of numMat matrices into every
/// following group so that each of the numConcurrentThreads groups holds
/// identical data.
/// @param matBlock             storage for numConcurrentThreads*numMat matrices; each entry must already hold at least randomMatSize*randomMatSize elements
/// @param PRNG                 random engine used to draw the values
/// @param rngdist              distribution the values are drawn from
/// @param randomMatSize        dimension of the square matrices
/// @param numConcurrentThreads number of groups in matBlock
/// @param numMat               number of matrices per group
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t elemsPerMatrix = static_cast<uint32_t>(randomMatSize*randomMatSize);
    const uint32_t totalMatrices = numConcurrentThreads*numMat;

    // Randomise the reference group (indices 0 .. numMat-1).
    for(uint32_t m = 0; m < numMat; ++m){
        std::vector<double>& target = matBlock[m];
        for(uint32_t e = 0; e < elemsPerMatrix; ++e){
            target[e] = rngdist(PRNG);
        }
    }

    // Replicate the reference group into each remaining group.
    for(uint32_t groupStart = numMat; groupStart < totalMatrices; groupStart += numMat){
        for(uint32_t offset = 0; offset < numMat; ++offset){
            matBlock[groupStart + offset] = matBlock[offset];
        }
    }
}
/// Fills the first numVec vectors of vecBlock with values drawn from
/// rngdist(PRNG), then copies that group of numVec vectors into every
/// following group so that each of the numConcurrentThreads groups holds
/// identical data.
/// @param vecBlock             storage for numConcurrentThreads*numVec vectors; each entry must already hold at least randomMatSize elements
/// @param PRNG                 random engine used to draw the values
/// @param rngdist              distribution the values are drawn from
/// @param randomMatSize        length of each vector
/// @param numConcurrentThreads number of groups in vecBlock
/// @param numVec               number of vectors per group
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
    const uint32_t elemsPerVector = static_cast<uint32_t>(randomMatSize);
    const uint32_t totalVectors = numConcurrentThreads*numVec;

    // Randomise the reference group (indices 0 .. numVec-1).
    for(uint32_t v = 0; v < numVec; ++v){
        std::vector<double>& target = vecBlock[v];
        for(uint32_t e = 0; e < elemsPerVector; ++e){
            target[e] = rngdist(PRNG);
        }
    }

    // Replicate the reference group into each remaining group.
    for(uint32_t groupStart = numVec; groupStart < totalVectors; groupStart += numVec){
        for(uint32_t offset = 0; offset < numVec; ++offset){
            vecBlock[groupStart + offset] = vecBlock[offset];
        }
    }
}
/// Creates a 64-bit Mersenne Twister seeded from std::random_device and
/// warms it up by drawing 10M values before returning it.
/// @return the seeded and warmed-up engine
std::mt19937_64 InitPRNG(){
    // Seed from the OS-provided entropy source (/dev/urandom or similar).
    std::random_device entropySource;
    std::mt19937_64 engine(entropySource());
    std::uniform_real_distribution<double> warmupDist{-1.0, 1.0};
    // PRNGs often have unreliable distribution uniformity and other
    // statistical properties before their internal state is sufficiently
    // mixed, so discard the first 10M draws.
    for (uint32_t remaining = 10000000; remaining != 0; --remaining){
        warmupDist(engine);
    }
    return engine;
}
/// Debug helper: writes every matrix in matBlock to std::cout. For each of
/// the numConcurrentThreads*numMat matrices it prints the matrix index on
/// its own line, then randomMatSize lines of randomMatSize space-separated
/// values (element outer*randomMatSize + inner), followed by an empty line.
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t dim = static_cast<uint32_t>(randomMatSize);
    const uint32_t matrixCount = numConcurrentThreads*numMat;
    for (uint32_t m = 0; m < matrixCount; ++m){
        std::cout<<m<<std::endl;
        const std::vector<double>& current = matBlock[m];
        for (uint32_t outer = 0; outer < dim; ++outer){
            for (uint32_t inner = 0; inner < dim; ++inner){
                std::cout<<current[outer*randomMatSize + inner]<<" ";
            }
            std::cout<<std::endl;
        }
        std::cout<<std::endl;
    }
}

View File

@@ -0,0 +1,92 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
/// DGEMM thread-safety tester.
/// Runs numConcurrentThreads identical DGEMM calls concurrently (one
/// std::async task per OpenMP loop iteration) and checks that every call
/// produced the same result matrix as the first one, for numTestRounds rounds.
///
/// Command line: either no arguments (defaults are used) or exactly three:
///   <matrix size> <concurrent calls> <test rounds>
/// @return 0 when all rounds pass, -1 as soon as one result differs
int main(int argc, char* argv[]){
    blasint randomMatSize = 1024; //dimension of the random square matrices used
    uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
    uint32_t numTestRounds = 16; //number of testing rounds before success exit
    if (argc > 4){
        std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
        abort();
    }

    if(argc == 4){
        std::vector<std::string> cliArgs;
        for (int i = 1; i < argc; i++){
            cliArgs.push_back(argv[i]);
            std::cout<<argv[i]<<std::endl;
        }
        randomMatSize = std::stoul(cliArgs[0]);
        numConcurrentThreads = std::stoul(cliArgs[1]);
        numTestRounds = std::stoul(cliArgs[2]);
    } else if (argc > 1){
        // Previously 1 or 2 arguments were dropped without any notice.
        std::cout<<"WARNING: expected exactly 3 arguments (matrix size, concurrent calls, test rounds); ignoring arguments and using defaults"<<std::endl;
    }

    // Widen BEFORE multiplying so the element count cannot overflow blasint.
    const size_t matElems = static_cast<size_t>(randomMatSize)*static_cast<size_t>(randomMatSize);

    std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
    // Three matrices (A, B, C) per concurrent call, stored consecutively.
    std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
    std::vector<std::future<void>> futureBlock(numConcurrentThreads);

    std::cout<<"*----------------------------*\n";
    std::cout<<"| DGEMM thread safety tester |\n";
    std::cout<<"*----------------------------*\n";
    std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
    std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
    std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
    std::cout<<"This test will need "<<(static_cast<uint64_t>(matElems)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;

    std::cout<<"Initializing random number generator..."<<std::flush;
    std::mt19937_64 PRNG = InitPRNG();
    std::cout<<"done\n";

    std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
    std::cout<<"Allocating matrices..."<<std::flush;
    for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
        matBlock[i].resize(matElems);
    }
    std::cout<<"done\n";
    //pauser();
    std::cout<<"Filling matrices with random numbers..."<<std::flush;
    FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
    //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
    std::cout<<"done\n";

    std::cout<<"Testing CBLAS DGEMM thread safety\n";
    omp_set_num_threads(numConcurrentThreads);
    for(uint32_t R=0; R<numTestRounds; R++){
        std::cout<<"DGEMM round #"<<R<<std::endl;
        std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
        // Each OpenMP iteration starts one async DGEMM on its own A/B/C triple.
        #pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
        for(uint32_t i=0; i<numConcurrentThreads; i++){
            futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
        }
        std::cout<<"done\n";

        std::cout<<"Waiting for threads to finish..."<<std::flush;
        for(uint32_t i=0; i<numConcurrentThreads; i++){
            futureBlock[i].get();
        }
        std::cout<<"done\n";
        //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);

        std::cout<<"Comparing results from different threads..."<<std::flush;
        // All calls started from identical inputs, so every C must match the
        // first call's C (index 2) within the tolerance.
        for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
            for(size_t j = 0; j < matElems; j++){
                if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
                    std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
                    std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
                    return -1;
                }
            }
        }
        std::cout<<"OK!\n"<<std::endl;
    }

    std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
    return 0;
}

View File

@@ -0,0 +1,101 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
/// DGEMV thread-safety tester.
/// Runs numConcurrentThreads identical DGEMV calls concurrently (one
/// std::async task per OpenMP loop iteration) and checks that every call
/// produced the same result vector as the first one, for numTestRounds rounds.
///
/// Command line: either no arguments (defaults are used) or exactly three:
///   <matrix size> <concurrent calls> <test rounds>
/// @return 0 when all rounds pass, -1 as soon as one result differs
int main(int argc, char* argv[]){
    blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
    uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
    uint32_t numTestRounds = 16; //number of testing rounds before success exit
    if (argc > 4){
        std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
        abort();
    }
    if(argc == 4){
        std::vector<std::string> cliArgs;
        for (int i = 1; i < argc; i++){
            cliArgs.push_back(argv[i]);
            std::cout<<argv[i]<<std::endl;
        }
        randomMatSize = std::stoul(cliArgs.at(0));
        numConcurrentThreads = std::stoul(cliArgs.at(1));
        numTestRounds = std::stoul(cliArgs.at(2));
    } else if (argc > 1){
        // Previously 1 or 2 arguments were dropped without any notice.
        std::cout<<"WARNING: expected exactly 3 arguments (matrix size, concurrent calls, test rounds); ignoring arguments and using defaults"<<std::endl;
    }

    // Widen BEFORE multiplying so the element count cannot overflow blasint.
    const size_t matElems = static_cast<size_t>(randomMatSize)*static_cast<size_t>(randomMatSize);

    std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
    // One matrix A per concurrent call, and two vectors (x, y) per call
    // stored consecutively in vecBlock.
    std::vector<std::vector<double>> matBlock(numConcurrentThreads);
    std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
    std::vector<std::future<void>> futureBlock(numConcurrentThreads);

    std::cout<<"*----------------------------*\n";
    std::cout<<"| DGEMV thread safety tester |\n";
    std::cout<<"*----------------------------*\n";
    std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
    std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
    std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
    std::cout<<"This test will need "<<((static_cast<uint64_t>(matElems)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;

    std::cout<<"Initializing random number generator..."<<std::flush;
    std::mt19937_64 PRNG = InitPRNG();
    std::cout<<"done\n";

    std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
    std::cout<<"Allocating matrices..."<<std::flush;
    for(uint32_t i=0; i<numConcurrentThreads; i++){
        matBlock.at(i).resize(matElems);
    }
    std::cout<<"done\n";
    std::cout<<"Allocating vectors..."<<std::flush;
    for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
        vecBlock.at(i).resize(randomMatSize);
    }
    std::cout<<"done\n";
    //pauser();

    std::cout<<"Filling matrices with random numbers..."<<std::flush;
    FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
    //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
    std::cout<<"done\n";
    std::cout<<"Filling vectors with random numbers..."<<std::flush;
    FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
    std::cout<<"done\n";

    std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
    omp_set_num_threads(numConcurrentThreads);
    for(uint32_t R=0; R<numTestRounds; R++){
        std::cout<<"DGEMV round #"<<R<<std::endl;
        std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
        // Each OpenMP iteration starts one async DGEMV on its own A/x/y set.
        #pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
        for(uint32_t i=0; i<numConcurrentThreads; i++){
            futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
        }
        std::cout<<"done\n";

        std::cout<<"Waiting for threads to finish..."<<std::flush;
        for(uint32_t i=0; i<numConcurrentThreads; i++){
            futureBlock[i].get();
        }
        std::cout<<"done\n";

        std::cout<<"Comparing results from different threads..."<<std::flush;
        // All calls started from identical inputs, so every y must match the
        // first call's y (index 1) within the tolerance.
        for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
            for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
                if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
                    std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
                    std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
                    return -1;
                }
            }
        }
        std::cout<<"OK!\n"<<std::endl;
    }

    std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
    return 0;
}

View File

@@ -53,6 +53,7 @@
#define VENDOR_SIS 8
#define VENDOR_TRANSMETA 9
#define VENDOR_NSC 10
#define VENDOR_HYGON 11
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -115,6 +116,8 @@
#define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26
#define CORE_ZEN 27
#define CORE_SKYLAKEX 28
#define CORE_DHYANA 29
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -137,6 +140,8 @@
#define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20)
#define HAVE_AVX512VL (1 << 21)
#define HAVE_AVX2 (1 << 22)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@@ -211,5 +216,9 @@ typedef struct {
#define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51
#define CPUTYPE_SKYLAKEX 52
#define CPUTYPE_DHYANA 53
#define CPUTYPE_HYGON_UNKNOWN 54
#endif

View File

@@ -34,7 +34,7 @@
#define CPU_CORTEXA15 4
static char *cpuname[] = {
"UNKOWN",
"UNKNOWN",
"ARMV6",
"ARMV7",
"CORTEXA9",

View File

@@ -29,27 +29,43 @@
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
#define CPU_CORTEXA57 2
#define CPU_VULCAN 3
#define CPU_THUNDERX 4
#define CPU_THUNDERX2T99 5
// Arm
#define CPU_CORTEXA53 2
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
// Qualcomm
#define CPU_FALKOR 6
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9
static char *cpuname[] = {
"UNKNOWN",
"ARMV8" ,
"CORTEXA53",
"CORTEXA57",
"VULCAN",
"CORTEXA72",
"CORTEXA73",
"FALKOR",
"THUNDERX",
"THUNDERX2T99"
"THUNDERX2T99",
"TSV110"
};
static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"armv8",
"cortexa53",
"cortexa57",
"vulcan",
"cortexa72",
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99"
"thunderx2t99",
"tsv110"
};
int get_feature(char *search)
@@ -78,7 +94,7 @@ int get_feature(char *search)
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
if (!strcmp(t, search)) { return(1); }
}
@@ -114,15 +130,28 @@ int detect(void)
fclose(infile);
if(cpu_part != NULL && cpu_implementer != NULL) {
if (strstr(cpu_implementer, "0x41") &&
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") ))
return CPU_CORTEXA57; //or compatible A53, A72
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
// Arm
if (strstr(cpu_implementer, "0x41")) {
if (strstr(cpu_part, "0xd03"))
return CPU_CORTEXA53;
else if (strstr(cpu_part, "0xd07"))
return CPU_CORTEXA57;
else if (strstr(cpu_part, "0xd08"))
return CPU_CORTEXA72;
else if (strstr(cpu_part, "0xd09"))
return CPU_CORTEXA73;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
return CPU_FALKOR;
// Cavium
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
return CPU_THUNDERX;
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
}
p = (char *) NULL ;
@@ -180,64 +209,63 @@ void get_subdirname(void)
void get_cpuconfig(void)
{
// All arches should define ARMv8
printf("#define ARMV8\n");
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
int d = detect();
switch (d)
{
case CPU_CORTEXA53:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8:
printf("#define ARMV8\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_VULCAN:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
// Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
printf("#define HAVE_NEON\n");
printf("#define HAVE_VFPV4\n");
case CPU_CORTEXA72:
case CPU_CORTEXA73:
// Common minimum settings for these Arm cores
// Can change a lot, but we need to be conservative
// TODO: detect info from /sys if possible
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
@@ -249,11 +277,7 @@ void get_cpuconfig(void)
break;
case CPU_THUNDERX2T99:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define THUNDERX2T99 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
@@ -269,6 +293,21 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
    /* Emit the configuration macros for the TSV110 core. All sizes are in
     * bytes. */
    printf("#define TSV110 \n");
    printf("#define L1_CODE_SIZE 65536 \n");
    printf("#define L1_CODE_LINESIZE 64 \n");
    printf("#define L1_CODE_ASSOCIATIVE 4 \n");
    printf("#define L1_DATA_SIZE 65536 \n");
    printf("#define L1_DATA_LINESIZE 64 \n");
    printf("#define L1_DATA_ASSOCIATIVE 4 \n");
    /* 512 KiB = 524288; the value was previously mistyped as 524228, which
     * is not a multiple of the 64-byte line size declared just below. */
    printf("#define L2_SIZE 524288 \n");
    printf("#define L2_LINESIZE 64 \n");
    printf("#define L2_ASSOCIATIVE 8 \n");
    printf("#define DTB_DEFAULT_ENTRIES 64 \n");
    printf("#define DTB_SIZE 4096 \n");
    break;
}
}
@@ -305,7 +344,7 @@ void get_features(void)
if( p == NULL ) return;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
}

View File

@@ -75,7 +75,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_1004K 2
static char *cpuname[] = {
"UNKOWN",
"UNKNOWN",
"P5600",
"1004K"
};

View File

@@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_I6500 6
static char *cpuname[] = {
"UNKOWN",
"UNKNOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B",

View File

@@ -56,6 +56,7 @@
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8
#define CPUTYPE_POWER9 9
char *cpuname[] = {
"UNKNOWN",
@@ -66,7 +67,8 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
"POWER8",
"POWER9"
};
char *lowercpuname[] = {
@@ -78,7 +80,8 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
"power8"
"power8",
"power9"
};
char *corename[] = {
@@ -90,7 +93,8 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
"POWER8",
"POWER9"
};
int detect(void){
@@ -120,6 +124,7 @@ int detect(void){
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
@@ -127,6 +132,33 @@ int detect(void){
#endif
#ifdef _AIX
/* AIX: take the processor name from the "Processor Type" line printed by
 * prtconf and map it to a CPUTYPE_* value. CPUTYPE_POWER5 is the fallback
 * whenever the name is unrecognised or cannot be obtained at all. */
FILE *infile;
char buffer[512], *p;

p = (char *)NULL;
infile = popen("prtconf|grep 'Processor Type'", "r");
/* popen() returns NULL on failure; fgets()/pclose() must never see that. */
if (infile == NULL) return CPUTYPE_POWER5;

while (fgets(buffer, sizeof(buffer), infile)){
  if (!strncmp("Pro", buffer, 3)){
    /* The name is expected two characters past the colon (": NAME").
     * Only step forward when the colon exists and is not the last
     * character, so p never points outside the buffer contents. */
    p = strchr(buffer, ':');
    if (p != NULL && p[1] != '\0') p += 2;
    else p = (char *)NULL;
#if 0
    if (p != NULL) fprintf(stderr, "%s\n", p);
#endif
    break;
  }
}

pclose(infile);

/* No usable line was read: previously a NULL pointer reached strncasecmp(). */
if (p == NULL) return CPUTYPE_POWER5;

if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
/* POWER7 is reported as POWER6 here, as in the original mapping. */
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
return CPUTYPE_POWER5;
#endif
@@ -142,6 +174,52 @@ int detect(void){
return CPUTYPE_PPC970;
#endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
/* BSD: read the processor version register with mfpvr and classify the CPU
 * by its upper 16 bits.
 *
 * id is unsigned on purpose: with a signed int, a register value whose top
 * bit is set (such as the 0x8003xxxx handled below) becomes negative, the
 * right shift keeps the sign, and the 0x8003 case could never match. */
unsigned int id;
__asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
  case 0x4e: // POWER9
    return CPUTYPE_POWER9;
  case 0x4d:
  case 0x4b: // POWER8/8E
    return CPUTYPE_POWER8;
  case 0x4a:
  case 0x3f: // POWER7/7E, reported as POWER6
    return CPUTYPE_POWER6;
  case 0x3e:
    return CPUTYPE_POWER6;
  case 0x3a:
    return CPUTYPE_POWER5;
  case 0x35:
  case 0x38: // POWER4 /4+
    return CPUTYPE_POWER4;
  case 0x40:
  case 0x41: // POWER3 /3+
    return CPUTYPE_POWER3;
  case 0x39:
  case 0x3c:
  case 0x44:
  case 0x45:
    return CPUTYPE_PPC970;
  case 0x70:
    return CPUTYPE_CELL;
  case 0x8003:
    return CPUTYPE_PPCG4;
  default:
    return CPUTYPE_UNKNOWN;
}
#endif
}
void get_architecture(void){

View File

@@ -50,6 +50,8 @@
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
#define CORE_SKYLAKEX CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
@@ -95,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
("mov %%ebx, %%edi;"
"cpuid;"
"xchgl %%ebx, %%edi;"
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
#else
__asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
#endif
}
@@ -209,6 +211,44 @@ int support_avx(){
#endif
}
/*
 * Report whether AVX2 may be used.
 *
 * Returns 0 when support_avx() returns 0 or when the build defines NO_AVX2.
 * Otherwise returns 1 if CPUID leaf 7 sets the AVX2 feature flag, else 0.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* AVX2 is EBX bit 5 of leaf 7 (the same bit support_avx512() tests as
   * "ebx & 32"). The bit tested here before, bit 7, is SMEP, which is
   * unrelated to vector support. */
  return (ebx & (1<<5)) != 0;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 (as indicated by the AVX512VL flag) may be used.
 *
 * Returns 0 when support_avx() returns 0, when the build defines NO_AVX or
 * NO_AVX512, when CPUID leaf 7 lacks the AVX2 or AVX512VL flag, or when the
 * value read by xgetbv(0) does not have bits 5-7 all set. Returns 1
 * otherwise.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx, edx;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* Without AVX2 (EBX bit 5) give up right away. The earlier code only
   * assigned ret=0 here, which it already was, and then carried on. */
  if((ebx & (1<<5)) == 0)
    return 0;
  /* AVX512VL is EBX bit 31; test it unsigned, because 1<<31 overflows int. */
  if(((unsigned int)ebx & (1u<<31)) == 0)
    return 0;
  xgetbv(0, &eax, &edx);
  /* 0xe0 selects bits 5-7 of the low word returned by xgetbv(0); per the
   * Intel SDM these are the AVX-512 state components (opmask, upper ZMM
   * halves, ZMM16-31) that the OS must have enabled. */
  return (eax & 0xe0) == 0xe0;
#else
  return 0;
#endif
}
int get_vendor(void){
int eax, ebx, ecx, edx;
@@ -231,6 +271,7 @@ int get_vendor(void){
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@@ -292,6 +333,8 @@ int get_cputype(int gettype){
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX;
if (support_avx2()) feature |= HAVE_AVX2;
if (support_avx512()) feature |= HAVE_AVX512VL;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif
@@ -1004,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
}
}
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) {
if ((get_vendor() == VENDOR_AMD) ||
(get_vendor() == VENDOR_HYGON) ||
(get_vendor() == VENDOR_CENTAUR)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
LDTB.size = 4096;
@@ -1166,7 +1211,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2;
}
break;
case 1:
case 1: // family 6 exmodel 1
switch (model) {
case 6:
return CPUTYPE_CORE2;
@@ -1183,7 +1228,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON;
}
break;
case 2:
case 2: // family 6 exmodel 2
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
@@ -1212,7 +1257,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 3:
case 3: // family 6 exmodel 3
switch (model) {
case 7:
// Bay Trail
@@ -1226,57 +1271,47 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 12:
case 15:
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 13:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
break;
case 4:
case 4: // family 6 exmodel 4
switch (model) {
case 5:
case 6:
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 7:
case 15:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 14:
//Skylake
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 12:
@@ -1286,54 +1321,85 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 5:
case 5: // family 6 exmodel 5
switch (model) {
case 6:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 5:
// Skylake X
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14:
// Skylake
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 7:
// Xeon Phi Knights Landing
if(support_avx())
#ifndef NO_AVX2
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
case 6: // family 6 exmodel 6
switch (model) {
case 14: // Kaby Lake
if(support_avx())
#ifndef NO_AVX2
case 6: // Cannon Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
#else
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 7: // family 6 exmodel 7
switch (model) {
case 10: // Goldmont Plus
return CPUTYPE_NEHALEM;
case 14: // Ice Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
@@ -1420,6 +1486,8 @@ int get_cpuname(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// AMD Ryzen2
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
@@ -1435,6 +1503,26 @@ int get_cpuname(void){
return CPUTYPE_AMD_UNKNOWN;
}
if (vendor == VENDOR_HYGON){
switch (family) {
case 0xf:
switch (exfamily) {
case 9:
//Hygon Dhyana
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
#else
return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CPUTYPE_BARCELONA;
}
break;
}
return CPUTYPE_HYGON_UNKNOWN;
}
if (vendor == VENDOR_CYRIX){
switch (family) {
case 0x4:
@@ -1556,6 +1644,8 @@ static char *cpuname[] = {
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
"SKYLAKEX",
"DHYANA"
};
static char *lowercpuname[] = {
@@ -1610,10 +1700,12 @@ static char *lowercpuname[] = {
"steamroller",
"excavator",
"zen",
"skylakex",
"dhyana"
};
static char *corename[] = {
"UNKOWN",
"UNKNOWN",
"80486",
"P5",
"P6",
@@ -1641,6 +1733,8 @@ static char *corename[] = {
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
"SKYLAKEX",
"DHYANA"
};
static char *corename_lower[] = {
@@ -1672,6 +1766,8 @@ static char *corename_lower[] = {
"steamroller",
"excavator",
"zen",
"skylakex",
"dhyana"
};
@@ -1860,6 +1956,19 @@ int get_coretype(void){
else
return CORE_NEHALEM;
case 5:
// Skylake X
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
case 14:
// Skylake
if(support_avx())
@@ -1958,6 +2067,8 @@ int get_coretype(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// Ryzen 2
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
@@ -1973,6 +2084,23 @@ int get_coretype(void){
}
}
if (vendor == VENDOR_HYGON){
if (family == 0xf){
if (exfamily == 9) {
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
#else
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CORE_BARCELONA;
} else {
return CORE_BARCELONA;
}
}
}
if (vendor == VENDOR_CENTAUR) {
switch (family) {
case 0x6:
@@ -2059,6 +2187,8 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
@@ -2127,6 +2257,8 @@ void get_sse(void){
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");

View File

@@ -27,17 +27,20 @@
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13"
"Z13",
"Z14"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13"
"z13",
"z14"
};
int detect(void)
@@ -61,6 +64,8 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
return CPU_GENERIC;
}
@@ -107,5 +112,16 @@ void get_cpuconfig(void)
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

View File

@@ -101,6 +101,10 @@ OS_INTERIX
OS_LINUX
#endif
#if defined(__HAIKU__)
OS_HAIKU
#endif
#if defined(__i386) || defined(_X86)
ARCH_X86
#endif
@@ -109,7 +113,7 @@ ARCH_X86
ARCH_X86_64
#endif
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
ARCH_POWER
#endif

View File

@@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
override TARGET_ARCH=
override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME)
@@ -102,7 +104,13 @@ clean ::
rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif
# Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

View File

@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -62,9 +62,36 @@
#endif
#endif
#ifndef TRANSA
#ifndef thread_local
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
# define thread_local _Thread_local
# elif defined _WIN32 && ( \
defined _MSC_VER || \
defined __ICL || \
defined __DMC__ || \
defined __BORLANDC__ )
# define thread_local __declspec(thread)
/* note that ICC (linux) and Clang are covered by __GNUC__ */
# elif defined __GNUC__ || \
defined __SUNPRO_C || \
defined __xlC__
# define thread_local __thread
# else
# define UNSAFE
#endif
#endif
#if defined USE_OPENMP
#undef UNSAFE
#endif
#if !defined(TRANSA) && !defined(UNSAFE)
#define Y_DUMMY_NUM 1024
#if defined(USE_OPENMP)
static FLOAT y_dummy[Y_DUMMY_NUM];
#pragma omp threadprivate(y_dummy)
# else
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
# endif
#endif
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
@@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifdef TRANSA
y += n_from * incy * COMPSIZE;
#else
# ifndef UNSAFE
//for split matrix row (n) direction and vector x of gemv_n
x += n_from * incx * COMPSIZE;
//store partial result for every thread
y += (m_to - m_from) * 1 * COMPSIZE * pos;
# endif
#endif
}
@@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
BLASLONG width, i, num_cpu;
#ifndef TRANSA
#if !defined(TRANSA) && !defined(UNSAFE)
int split_x=0;
#endif
@@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
i -= width;
}
#ifndef TRANSA
#if !defined(TRANSA) && !defined(UNSAFE)
//try to split matrix on row direction and x.
//Then, reduction.
if (num_cpu < nthreads) {
@@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
exec_blas(num_cpu, queue);
}
#ifndef TRANSA
#if !defined(TRANSA) && !defined(UNSAFE)
if(split_x==1){
//reduction
for(i=0; i<num_cpu; i++){

View File

@@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
COPY_K(m, b, incb, buffer, 1);
}
/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */
/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */
for (is = 0; is < m; is += DTB_ENTRIES){
for (is = 0; is < m; is += DTB_ENTRIES * 100){
min_i = MIN(m - is, DTB_ENTRIES * 100);
min_i = MIN(m - is, DTB_ENTRIES);
#ifndef TRANSA
if (is > 0){
fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n");
if (is > 0){
GEMV_N(is, min_i, 0, dp1,
a + is * lda, lda,
B + is, 1,

View File

@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;

View File

@@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)

View File

@@ -48,6 +48,10 @@
#define SWITCH_RATIO 2
#endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -91,11 +95,7 @@
#endif
typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;
@@ -351,7 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING)
@@ -413,7 +413,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */
START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
/* Apply kernel with local region of A and part of other region of B */
@@ -431,6 +431,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
} while (current != mypos);
@@ -492,7 +493,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
}
}
STOP_RPCC(waiting3);
@@ -513,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
return 0;
}
/*
 * Pad a proposed partition width up to the next multiple of `multiple`.
 *
 *   remainder - amount of work still left to distribute
 *   width     - width proposed for the next partition
 *   multiple  - preferred granularity
 *
 * The width is returned unchanged when the remaining work is smaller than
 * one granule, or when the width does not exceed one granule; otherwise the
 * smallest multiple of `multiple` that is >= width is returned.
 */
static int round_up(int remainder, int width, int multiple)
{
  int granules;

  if (remainder < multiple || multiple >= width) {
    return width;
  }

  /* ceiling division, then scale back up */
  granules = (width + multiple - 1) / multiple;
  return granules * multiple;
}
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) {
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
@@ -557,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
#endif
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
#ifdef USE_ALLOC_HEAP
/* Dynamically allocate workspace */
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
@@ -604,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0;
while (m > 0){
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
width = round_up(m, width, GEMM_PREFERED_SIZE);
m -= width;
if (m < 0) width = width + m;
range_M[num_parts + 1] = range_M[num_parts] + width;
num_parts ++;
}
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -648,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);
n -= width;
if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width;
num_parts ++;
}
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
@@ -658,8 +694,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}
/* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) {
for (i = 0; i < nthreads; i++) {
for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
}
@@ -674,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job);
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0;
}

View File

@@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
BLASLONG width, i;
BLASLONG n_from, n_to;
double dnum, nf, nt, di;
double dnum, nf, nt, di, dinum;
int num_cpu;
int mask = 0;
@@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
dinum = di * di +dnum;
if (dinum <0)
width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1);
else
width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
nf = (double)(arg -> n - n_from);
nt = (double)(arg -> n - n_to);
dnum = (nt * nt - nf * nf) / (double)nthreads;
num_cpu = 0;
range[0] = n_from;
@@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
dinum = di * di + dnum;
if (dinum<0)
width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1);
else
width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
} else {

View File

@@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic.c)
if (ARM64)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()
else ()
list(APPEND COMMON_SOURCES parameter.c)
endif ()

View File

@@ -15,7 +15,15 @@ endif
# COMMONOBJS += info.$(SUFFIX)
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
COMMONOBJS += dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
COMMONOBJS += dynamic_power.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@@ -71,7 +79,15 @@ BLAS_SERVER = blas_server.c
endif
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>
@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */
/* Usually it turns off and it's for debugging. */
/* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER];
@@ -582,7 +582,7 @@ int blas_thread_init(void){
if(ret!=0){
struct rlimit rlim;
const char *msg = strerror(ret);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
#ifdef RLIMIT_NPROC
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
@@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
long i;
#ifdef SMP_SERVER
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY

View File

@@ -48,6 +48,10 @@
#else
#ifndef OMP_SCHED
#define OMP_SCHED static
#endif
int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break;
}
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3

View File

@@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
/* We need this global for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */
@@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}
@@ -478,7 +483,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
void goto_set_num_threads(int num_threads)
{
long i;
long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number;

View File

@@ -49,6 +49,167 @@
#define EXTERN
#endif
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_PRESCOTT;
#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
/* ATOM kernels: use the dedicated table when it is built (DYN_ATOM),
   otherwise alias it to NEHALEM if that is built, else to PRESCOTT. */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD;
@@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON;
extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
@@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
@@ -90,10 +269,12 @@ extern gotoblas_t gotoblas_ZEN;
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
#endif // DYNAMIC_LIST
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -124,9 +305,49 @@ int support_avx(){
#endif
}
/*
 * Report whether AVX2 kernels may be used.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 has EBX bit 7
 * (the AVX2 feature flag) set; returns 0 otherwise, and always 0 when the
 * library is built with NO_AVX2.
 */
int support_avx2(){
#ifdef NO_AVX2
  return 0;
#else
  int eax, ebx, ecx=0, edx;

  /* Without usable AVX there is no point in probing for AVX2. */
  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);
  if ((ebx & (1<<7)) == 0)
    return 0;
  return 1;  /* AVX2 feature bit is set */
#endif
}
/*
 * Report whether AVX512 kernels may be used.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 reports AVX2 (EBX bit 7),
 *   - CPUID leaf 7 reports EBX bit 31 (AVX512VL),
 *   - XGETBV(0) shows bits 5..7 (mask 0xe0) all enabled.
 * Returns 0 otherwise, and always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx, edx;
  int ret=0;

  if (!support_avx())
    return 0;
  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* The masked value is either 0 or 1<<7, so it must be compared against 0;
     comparing against 1 can never detect a missing AVX2 flag. */
  if((ebx & (1<<7)) == 0)
    return 0;  //AVX2 feature bit not set, so no AVX512 either
  /* Test bit 31 in unsigned arithmetic: 1<<31 overflows a signed int. */
  if(((unsigned int)ebx & (1U<<31)) != 0){
    xgetbv(0, &eax, &edx);
    if((eax & 0xe0) == 0xe0)
      ret=1;  //OS supports AVX512VL
  }
  return ret;
#else
  return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
static int get_vendor(void){
@@ -149,6 +370,7 @@ static int get_vendor(void){
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@@ -223,18 +445,24 @@ static gotoblas_t *get_coretype(void){
}
//Intel Haswell
if (model == 12 || model == 15) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 13) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
@@ -244,27 +472,36 @@ static gotoblas_t *get_coretype(void){
case 4:
//Intel Haswell
if (model == 5 || model == 6) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 7 || model == 15) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Skylake
if (model == 14) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
@@ -277,42 +514,104 @@ static gotoblas_t *get_coretype(void){
case 5:
//Intel Broadwell
if (model == 6) {
if(support_avx())
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Skylake
if (model == 14 || model == 5) {
if(support_avx())
if (model == 5) {
// Intel Skylake X
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
else{
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
//Intel Skylake
if (model == 14) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Phi Knights Landing
if (model == 7) {
if(support_avx())
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
else{
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake
if (model == 12) {
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
return &gotoblas_NEHALEM;
}
return NULL;
case 6:
if (model == 6) {
// Cannon Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 7:
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake
if(support_avx())
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
else{
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
@@ -325,7 +624,7 @@ static gotoblas_t *get_coretype(void){
}
}
if (vendor == VENDOR_AMD){
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
if (family <= 0xe) {
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
@@ -397,7 +696,7 @@ static gotoblas_t *get_coretype(void){
}
}
} else if (exfamily == 8) {
if (model == 1) {
if (model == 1 || model == 8) {
if(support_avx())
return &gotoblas_ZEN;
else{
@@ -405,6 +704,13 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else if (exfamily == 9) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else {
return &gotoblas_BARCELONA;
}
@@ -445,7 +751,8 @@ static char *corename[] = {
"Haswell",
"Steamroller",
"Excavator",
"Zen"
"Zen",
"SkylakeX"
};
char *gotoblas_corename(void) {
@@ -473,7 +780,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
return corename[0];
}
@@ -485,7 +792,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
for ( i=1 ; i <= 23; i++)
for ( i=1 ; i <= 24; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
@@ -503,6 +810,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found)
{
case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);

View File

@@ -0,0 +1,198 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#include <asm/hwcap.h>
#include <sys/auxv.h>
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 4
/*
* In case asm/hwcap.h is outdated on the build system, make sure
* that HWCAP_CPUID is defined
*/
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
/*
 * Read the system register named by `id` into the lvalue `var` with an
 * MRS instruction. `id` is pasted into the instruction text via the
 * stringify operator; the whole thing is wrapped in a GNU statement
 * expression so it can be used like a function call.
 */
#define get_cpu_ftr(id, var) ({ \
asm("mrs %0, "#id : "=r" (var)); \
})
/*
 * Printable core names. The first NUM_CORETYPES entries are the names that
 * force_coretype() accepts and gotoblas_corename() reports; the entry at
 * index NUM_CORETYPES ("unknown") is what gotoblas_corename() returns when
 * the active table matches none of the known cores.
 */
static char *corename[] = {
"armv8",
"cortexa57",
"thunderx",
"thunderx2t99",
"unknown"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
return corename[NUM_CORETYPES];
}
/*
 * Map a user-supplied core name to its kernel table.
 *
 * coretype: name to look up; compared case-insensitively against the first
 *           NUM_CORETYPES entries of corename[], at most 20 characters.
 *
 * Returns the matching table, or NULL after emitting a "Core not found"
 * warning when the name is not recognised.
 */
static gotoblas_t *force_coretype(char *coretype) {
  /* Tables in the same order as the names in corename[]. */
  gotoblas_t *const tables[NUM_CORETYPES] = {
    &gotoblas_ARMV8,
    &gotoblas_CORTEXA57,
    &gotoblas_THUNDERX,
    &gotoblas_THUNDERX2T99
  };
  char message[128];
  int idx;

  for (idx = 0; idx < NUM_CORETYPES; idx++) {
    if (strncasecmp(coretype, corename[idx], 20) == 0)
      return tables[idx];
  }

  snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
  openblas_warning(1, message);
  return NULL;
}
/*
 * Detect the running core from MIDR_EL1 and return the matching kernel
 * table.
 *
 * Returns NULL (after a warning) when the kernel does not advertise
 * HWCAP_CPUID, and NULL silently when the implementer/part pair is not one
 * of the combinations listed below.
 */
static gotoblas_t *get_coretype(void) {
  int midr_el1;
  int implementer, part;

  /* Reading MIDR_EL1 is only attempted when the kernel reports HWCAP_CPUID. */
  if ((getauxval(AT_HWCAP) & HWCAP_CPUID) == 0) {
    openblas_warning(1, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
    return NULL;
  }

  get_cpu_ftr(MIDR_EL1, midr_el1);

  /*
   * MIDR_EL1
   *
   * 31          24 23     20 19          16 15          4 3        0
   * -----------------------------------------------------------------
   * | Implementer | Variant | Architecture | Part Number | Revision |
   * -----------------------------------------------------------------
   */
  implementer = (midr_el1 >> 24) & 0xFF;
  part        = (midr_el1 >> 4) & 0xFFF;

  if (implementer == 0x41) {            /* ARM */
    /* Cortex A57, Cortex A72 and Cortex A53 all use the A57 kernels. */
    if (part == 0xd07 || part == 0xd08 || part == 0xd03)
      return &gotoblas_CORTEXA57;
  } else if (implementer == 0x42) {     /* Broadcom */
    if (part == 0x516)                  /* Vulcan */
      return &gotoblas_THUNDERX2T99;
  } else if (implementer == 0x43) {     /* Cavium */
    if (part == 0x0a1)                  /* ThunderX */
      return &gotoblas_THUNDERX;
    if (part == 0x0af)                  /* ThunderX2 */
      return &gotoblas_THUNDERX2T99;
  }

  return NULL;
}
/*
 * Select and initialise the kernel table for this process.
 *
 * Does nothing if `gotoblas` is already set. Otherwise the core is taken
 * from the OPENBLAS_CORETYPE environment variable when present, or
 * auto-detected; if neither yields a table the generic ARMV8 kernels are
 * used. The chosen table's init() hook is then called. If the table has no
 * init hook the process exits with status 1.
 */
void gotoblas_dynamic_init(void) {
  char coremsg[128];
  char *p;

  if (gotoblas) return;

  p = getenv("OPENBLAS_CORETYPE");
  if (p) {
    gotoblas = force_coretype(p);
  } else {
    gotoblas = get_coretype();
  }

  if (gotoblas == NULL) {
    snprintf(coremsg, sizeof(coremsg), "Falling back to generic ARMV8 core\n");
    openblas_warning(1, coremsg);
    gotoblas = &gotoblas_ARMV8;
  }

  if (gotoblas && gotoblas->init) {
    /*
     * Format the name directly with a bounded conversion. The previous
     * strncpy() into an uninitialised buffer did not guarantee a NUL
     * terminator for names of 20 characters or more.
     */
    snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
    openblas_warning(2, coremsg);
    gotoblas->init();
  } else {
    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
}
/*
 * Forget the selected kernel table. Because gotoblas_dynamic_init() returns
 * early while `gotoblas` is non-NULL, clearing it here lets a later init
 * call run the selection again.
 */
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@@ -0,0 +1,102 @@
#include "common.h"
extern gotoblas_t gotoblas_POWER6;
extern gotoblas_t gotoblas_POWER8;
extern gotoblas_t gotoblas_POWER9;
extern void openblas_warning(int verbose, const char *msg);
static char *corename[] = {
"unknown",
"POWER6",
"POWER8",
"POWER9"
};
#define NUM_CORETYPES 4
/*
 * Return the printable name of the kernel table currently installed in
 * `gotoblas`; "unknown" (corename[0]) when it is none of the POWER tables.
 */
char *gotoblas_corename(void) {
  int slot = 0;

  if (gotoblas == &gotoblas_POWER6) {
    slot = 1;
  } else if (gotoblas == &gotoblas_POWER8) {
    slot = 2;
  } else if (gotoblas == &gotoblas_POWER9) {
    slot = 3;
  }
  return corename[slot];
}
/*
 * Auto-detect the running POWER core via the compiler's __builtin_cpu_is()
 * and return the matching kernel table, or NULL when the CPU is none of
 * power6/power6x/power8/power9.
 */
static gotoblas_t *get_coretype(void) {
  gotoblas_t *detected = NULL;

  if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) {
    detected = &gotoblas_POWER6;
  } else if (__builtin_cpu_is("power8")) {
    detected = &gotoblas_POWER8;
  } else if (__builtin_cpu_is("power9")) {
    detected = &gotoblas_POWER9;
  }
  return detected;
}
/*
 * Map a user-supplied core name to its kernel table.
 *
 * coretype: name to look up; compared case-insensitively against the
 *           entries of corename[], at most 20 characters.
 *
 * Returns the matching table for POWER6/POWER8/POWER9. For any other name
 * (including the "unknown" placeholder) a "Core not found" warning is
 * emitted and NULL is returned.
 */
static gotoblas_t *force_coretype(char * coretype) {
  int i;
  int found = -1;
  char message[128];

  for (i = 0; i < NUM_CORETYPES; i++) {
    if (!strncasecmp(coretype, corename[i], 20)) {
      found = i;
      break;
    }
  }

  switch (found) {
    case 1: return (&gotoblas_POWER6);
    case 2: return (&gotoblas_POWER8);
    case 3: return (&gotoblas_POWER9);
    default: break;
  }

  /*
   * Previously a `default: return NULL` inside the switch made this warning
   * unreachable, so an invalid OPENBLAS_CORETYPE was silently ignored.
   */
  snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
  openblas_warning(1, message);
  return NULL;
}
/*
 * Select and initialise the kernel table for this process.
 *
 * Does nothing if `gotoblas` is already set. Otherwise the core is taken
 * from the OPENBLAS_CORETYPE environment variable when present, or
 * auto-detected; if neither yields a table the POWER8 kernels are used.
 * The chosen table's init() hook is then called. If the table has no init
 * hook the process exits with status 1.
 */
void gotoblas_dynamic_init(void) {
  char coremsg[128];
  char *p;

  if (gotoblas) return;

  p = getenv("OPENBLAS_CORETYPE");
  if (p) {
    gotoblas = force_coretype(p);
  } else {
    gotoblas = get_coretype();
  }

  if (gotoblas == NULL) {
    snprintf(coremsg, sizeof(coremsg), "Falling back to POWER8 core\n");
    openblas_warning(1, coremsg);
    gotoblas = &gotoblas_POWER8;
  }

  if (gotoblas && gotoblas->init) {
    /*
     * Format the name directly with a bounded conversion. The previous
     * strncpy() into an uninitialised buffer did not guarantee a NUL
     * terminator for names of 20 characters or more.
     */
    snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
    openblas_warning(2, coremsg);
    gotoblas->init();
  } else {
    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
}
/*
 * Forget the selected kernel table. Because gotoblas_dynamic_init() returns
 * early while `gotoblas` is non-NULL, clearing it here lets a later init
 * call run the selection again.
 */
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
/* if number of threads is larger than inital condition */
/* if number of threads is larger than initial condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums;
int ret;
@@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
} else {
@@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
common->num_procs = CPU_COUNT(&cpuset);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
static char* openblas_config_str=""
"OpenBLAS "
VERSION
" "
#ifdef USE64BITINT
"USE64BITINT "
" USE64BITINT "
#endif
#ifdef NO_CBLAS
"NO_CBLAS "

View File

@@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
int size = 16;
#else
int size = get_L2_size();
@@ -730,35 +730,8 @@ void blas_set_parameter(void){
#if defined(ARCH_ARM64)
#if defined(VULCAN) || defined(THUNDERX2T99)
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
dgemm_p = 160;
dgemm_q = 128;
dgemm_r = 4096;
sgemm_p = 128;
sgemm_q = 352;
sgemm_r = 4096;
cgemm_p = 128;
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;
#endif
}
#endif

897
dynamic.c Normal file
View File

@@ -0,0 +1,897 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#ifdef _MSC_VER
#define strncasecmp _strnicmp
#define strcasecmp _stricmp
#endif
#ifdef ARCH_X86
#define EXTERN extern
#else
#define EXTERN
#endif
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_PRESCOTT;
#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
/*
 * ATOM kernels: use the real table when built in (DYN_ATOM), otherwise alias
 * to NEHALEM when that is built in, else to the always-present PRESCOTT.
 * The middle branch was previously written without its leading '#', which
 * made it a syntax error with DYN_ATOM set and unreachable otherwise.
 */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD;
EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON;
extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
#endif // DYNAMIC_LIST
/* Vendor codes returned by get_vendor(). */
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_UNKNOWN 99
/* Extract a bit field: shift `a` right by `b` bits, then mask with `c`. */
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#ifndef NO_AVX
/*
 * Execute XGETBV with ECX = op and store the values left in EAX and EDX
 * through `eax` and `edx`. Only compiled when NO_AVX is not defined.
 */
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
// (the instruction is emitted as raw opcode bytes rather than by mnemonic;
// presumably so that assemblers lacking the mnemonic still build - TODO confirm)
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
/*
 * Report whether AVX can be used.
 *
 * Returns 1 only when CPUID leaf 1 has ECX bits 26, 27 and 28 all set and
 * XGETBV(0) reports bits 1 and 2 set in EAX; returns 0 otherwise, and
 * always 0 when built with NO_AVX.
 */
int support_avx(){
#ifndef NO_AVX
  int eax, ebx, ecx, edx;
  const int required = (1 << 28) | (1 << 27) | (1 << 26);

  cpuid(1, &eax, &ebx, &ecx, &edx);
  if ((ecx & required) != required)
    return 0;

  xgetbv(0, &eax, &edx);
  return ((eax & 6) == 6) ? 1 : 0; //OS support AVX
#else
  return 0;
#endif
}
/*
 * Report whether AVX2 can be used.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 has EBX bit 7 set;
 * returns 0 otherwise, and always 0 when built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);
  return ((ebx & (1<<7)) != 0) ? 1 : 0; //OS supports AVX2
#else
  return 0;
#endif
}
/*
 * Report whether the AVX512 kernels can be used.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 has EBX bit 7 (AVX2) set,
 *   - CPUID leaf 7 has EBX bit 31 (AVX512VL) set,
 *   - XGETBV(0) reports bits 5..7 (mask 0xe0) set in EAX.
 * Returns 0 otherwise, and always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  /* ecx is zeroed before the leaf-7 query, as support_avx2() does.
     NOTE(review): whether cpuid() reads the incoming value is not visible here. */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);

  /* The old test compared the masked value with 1, which can never match
     (the mask yields 0 or 128), so the AVX2 guard never took effect. */
  if ((ebx & (1 << 7)) == 0)
    return 0; //OS does not even support AVX2

  /* 1u avoids shifting into the sign bit of a signed int. */
  if ((ebx & (1u << 31)) == 0)
    return 0;

  xgetbv(0, &eax, &edx);
  return ((eax & 0xe0) == 0xe0) ? 1 : 0; //OS supports AVX512VL
#else
  return 0;
#endif
}
/* Warning sink defined elsewhere; takes a verbosity level and the message text. */
extern void openblas_warning(int verbose, const char * msg);
/* Verbosity level passed to openblas_warning() for the fallback notices below. */
#define FALLBACK_VERBOSE 1
/* Notices emitted by get_coretype() when it selects kernels older than the
   detected CPU model would normally get. */
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
/*
 * Identify the CPU vendor from CPUID leaf 0.
 *
 * The 12-byte vendor string is assembled from EBX, EDX, ECX (in that order)
 * and compared against the known vendor strings. If none matches, the CPU
 * is still reported as Intel when EAX is 0 or has any of the 0x500 bits
 * set; otherwise VENDOR_UNKNOWN is returned.
 */
static int get_vendor(void){
  static const struct {
    const char *tag;
    int code;
  } known[] = {
    { "GenuineIntel", VENDOR_INTEL   },
    { "AuthenticAMD", VENDOR_AMD     },
    { "CentaurHauls", VENDOR_CENTAUR },
    { "HygonGenuine", VENDOR_HYGON   }
  };
  int eax, ebx, ecx, edx;
  unsigned int k;
  union
  {
    char vchar[16];
    int vint[4];
  } id;

  cpuid(0, &eax, &ebx, &ecx, &edx);

  id.vint[0] = ebx;
  id.vint[1] = edx;
  id.vint[2] = ecx;
  id.vchar[12] = '\0';

  for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
    if (strcmp(id.vchar, known[k].tag) == 0)
      return known[k].code;
  }

  if (eax == 0 || (eax & 0x500) != 0)
    return VENDOR_INTEL;

  return VENDOR_UNKNOWN;
}
/* Intel part whose best kernels need AVX: Sandybridge when AVX is usable,
   otherwise warn and use Nehalem. */
static gotoblas_t *intel_avx_kernels(void){
  if (support_avx())
    return &gotoblas_SANDYBRIDGE;
  openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
  return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}

/* Last two rungs of the Intel fallback ladder; always warns, because the
   caller has already failed to get the kernels it wanted. */
static gotoblas_t *intel_avx_fallback(void){
  if (support_avx()) {
    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
    return &gotoblas_SANDYBRIDGE;
  }
  openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
  return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}

/* Intel part whose best kernels need AVX2: Haswell when usable, otherwise
   step down through intel_avx_fallback(). */
static gotoblas_t *intel_avx2_kernels(void){
  if (support_avx2())
    return &gotoblas_HASWELL;
  return intel_avx_fallback();
}

/* Haswell kernels chosen as a downgrade: warns even when AVX2 is usable. */
static gotoblas_t *intel_avx2_fallback(void){
  if (support_avx2()) {
    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
    return &gotoblas_HASWELL;
  }
  return intel_avx_fallback();
}

/* Intel part whose best kernels need AVX512: SkylakeX when usable, otherwise
   step down through intel_avx2_fallback(). */
static gotoblas_t *intel_avx512_kernels(void){
  if (support_avx512())
    return &gotoblas_SKYLAKEX;
  return intel_avx2_fallback();
}

/* AMD part whose best kernels need AVX: `preferred` when AVX is usable,
   otherwise warn and use Barcelona. */
static gotoblas_t *amd_avx_kernels(gotoblas_t *preferred){
  if (support_avx())
    return preferred;
  openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
  return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}

/*
 * Auto-detect the running x86 core from CPUID leaf 1 and the vendor string,
 * and return the kernel table to use for it.
 *
 * Returns NULL when the vendor/family/model combination is not listed; the
 * caller is expected to pick a default in that case.
 */
static gotoblas_t *get_coretype(void){
  int eax, ebx, ecx, edx;
  int family, exfamily, model, vendor, exmodel;

  cpuid(1, &eax, &ebx, &ecx, &edx);

  family   = BITMASK(eax,  8, 0x0f);
  exfamily = BITMASK(eax, 20, 0xff);
  model    = BITMASK(eax,  4, 0x0f);
  exmodel  = BITMASK(eax, 16, 0x0f);

  vendor = get_vendor();

  if (vendor == VENDOR_INTEL){
    switch (family) {
    case 0x6:
      switch (exmodel) {
      case 0:
        if (model <= 0x7) return &gotoblas_KATMAI;
        switch (model) {
        case 0x8: case 0xa: case 0xb:
          return &gotoblas_COPPERMINE;
        case 0x9: case 0xd: case 14:
          return &gotoblas_BANIAS;
        case 15:
          return &gotoblas_CORE2;
        }
        return NULL;

      case 1:
        switch (model) {
        case 6:
          return &gotoblas_CORE2;
        case 7:
          return &gotoblas_PENRYN;
        case 13:
          return &gotoblas_DUNNINGTON;
        case 10: case 11: case 14: case 15:
          return &gotoblas_NEHALEM;
        case 12:
          return &gotoblas_ATOM;
        }
        return NULL;

      case 2:
        switch (model) {
        //Intel Core (Clarkdale) / Core (Arrandale)
        // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
        // Xeon (Clarkdale), 32nm
        case 5:
        //Intel Xeon Processor 5600 (Westmere-EP)
        //Xeon Processor E7 (Westmere-EX)
        //Xeon E7540
        case 12: case 14: case 15:
          return &gotoblas_NEHALEM;
        //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
        //Intel Core i7-3000 / Xeon E5
        case 10: case 13:
          return intel_avx_kernels();
        }
        return NULL;

      case 3:
        switch (model) {
        //Intel Sandy Bridge 22nm (Ivy Bridge?)
        case 10: case 14:
          return intel_avx_kernels();
        //Intel Haswell
        case 12: case 15:
        //Intel Broadwell
        case 13:
          return intel_avx2_kernels();
        case 7:
          return &gotoblas_ATOM; //Bay Trail
        }
        return NULL;

      case 4:
        switch (model) {
        //Intel Haswell
        case 5: case 6:
        //Intel Broadwell
        case 7: case 15:
        //Intel Skylake
        case 14:
          return intel_avx2_kernels();
        //Intel Braswell / Avoton
        case 12: case 13:
          return &gotoblas_NEHALEM;
        }
        return NULL;

      case 5:
        switch (model) {
        //Intel Broadwell
        case 6:
        //Intel Skylake
        case 14:
          return intel_avx2_kernels();
        // Intel Skylake X
        case 5:
          return intel_avx512_kernels();
        //Intel Phi Knights Landing
        case 7:
          return intel_avx2_fallback();
        //Apollo Lake or Denverton
        case 12: case 15:
          return &gotoblas_NEHALEM;
        }
        return NULL;

      case 6:
        // Cannon Lake
        if (model == 6)
          return intel_avx2_kernels();
        return NULL;

      case 7:
        if (model == 10) // Goldmont plus
          return &gotoblas_NEHALEM;
        if (model == 14) // Ice Lake
          return intel_avx512_kernels();
        return NULL;

      case 9:
      case 8:
        if (model == 14) // Kaby Lake, Coffee Lake
          return intel_avx2_kernels();
        return NULL;
      }
      /* NOTE(review): there is no break here, so a family-6 part whose
         extended model is not listed above falls through into the
         family-0xf handling below. This looks unintentional; confirm
         before changing, since it affects which default is chosen. */
    case 0xf:
      if (model <= 0x2) return &gotoblas_NORTHWOOD;
      return &gotoblas_PRESCOTT;
    }
  }

  if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
    if (family <= 0xe) {
      // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
      cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
      if ((eax & 0xffff) < 0x01)
        return NULL;
      cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
      if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
        return NULL;
      return &gotoblas_ATHLON;
    }

    if (family == 0xf){
      switch (exfamily) {
      case 0:
      case 2:
        /* ecx still holds the CPUID leaf-1 value here; bit 0 selects the
           SSE3 variant. */
        return (ecx & (1 << 0)) ? &gotoblas_OPTERON_SSE3 : &gotoblas_OPTERON;

      case 5:
        return &gotoblas_BOBCAT;

      case 6:
        switch (model) {
        case 1:
          //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
          return amd_avx_kernels(&gotoblas_BULLDOZER);
        case 2: case 3:
          //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
          return amd_avx_kernels(&gotoblas_PILEDRIVER);
        case 5:
          return amd_avx_kernels(&gotoblas_EXCAVATOR);
        case 0: case 8:
          if (exmodel == 1) //AMD Trinity
            return amd_avx_kernels(&gotoblas_PILEDRIVER);
          if (exmodel == 3) //AMD STEAMROLLER
            return amd_avx_kernels(&gotoblas_STEAMROLLER);
          if (exmodel == 6)
            return amd_avx_kernels(&gotoblas_EXCAVATOR);
          break;
        }
        break; /* unlisted model: fall out to the final return NULL */

      case 8:
        if (model == 1 || model == 8)
          return amd_avx_kernels(&gotoblas_ZEN);
        break; /* unlisted model: fall out to the final return NULL */

      case 9:
        return amd_avx_kernels(&gotoblas_ZEN);

      default:
        return &gotoblas_BARCELONA;
      }
    }
  }

  if (vendor == VENDOR_CENTAUR) {
    switch (family) {
    case 0x6:
      return &gotoblas_NANO;
    }
  }

  return NULL;
}
/*
 * Printable core names. Index 0 is the placeholder returned by
 * gotoblas_corename() when no known table is active. Indices 1..24 are
 * referenced by number in gotoblas_corename() and force_coretype(), so
 * entries must not be reordered without updating both functions.
 */
static char *corename[] = {
"Unknown",
"Katmai",
"Coppermine",
"Northwood",
"Prescott",
"Banias",
"Atom",
"Core2",
"Penryn",
"Dunnington",
"Nehalem",
"Athlon",
"Opteron",
"Opteron_SSE3",
"Barcelona",
"Nano",
"Sandybridge",
"Bobcat",
"Bulldozer",
"Piledriver",
"Haswell",
"Steamroller",
"Excavator",
"Zen",
"SkylakeX"
};
/*
 * Return the printable name of the kernel table currently installed in
 * `gotoblas`, or corename[0] ("Unknown") when it matches none of them.
 *
 * Several gotoblas_* names may be macro aliases of another table depending
 * on the build configuration; in that case the first matching test below
 * wins, so the order of the tests is significant.
 */
char *gotoblas_corename(void) {
  if (gotoblas == &gotoblas_KATMAI)       return corename[ 1];
  if (gotoblas == &gotoblas_COPPERMINE)   return corename[ 2];
  if (gotoblas == &gotoblas_NORTHWOOD)    return corename[ 3];
  if (gotoblas == &gotoblas_PRESCOTT)     return corename[ 4];
  if (gotoblas == &gotoblas_BANIAS)       return corename[ 5];
  if (gotoblas == &gotoblas_ATOM)         return corename[ 6];
  if (gotoblas == &gotoblas_CORE2)        return corename[ 7];
  if (gotoblas == &gotoblas_PENRYN)       return corename[ 8];
  if (gotoblas == &gotoblas_DUNNINGTON)   return corename[ 9];
  if (gotoblas == &gotoblas_NEHALEM)      return corename[10];
  if (gotoblas == &gotoblas_ATHLON)       return corename[11];
  /* corename[12] is "Opteron" and corename[13] is "Opteron_SSE3"; the two
     indices used to be swapped here, so each core reported the other's name. */
  if (gotoblas == &gotoblas_OPTERON)      return corename[12];
  if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[13];
  if (gotoblas == &gotoblas_BARCELONA)    return corename[14];
  if (gotoblas == &gotoblas_NANO)         return corename[15];
  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
  if (gotoblas == &gotoblas_PILEDRIVER)   return corename[19];
  if (gotoblas == &gotoblas_HASWELL)      return corename[20];
  if (gotoblas == &gotoblas_STEAMROLLER)  return corename[21];
  if (gotoblas == &gotoblas_EXCAVATOR)    return corename[22];
  if (gotoblas == &gotoblas_ZEN)          return corename[23];
  if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
  return corename[0];
}
/*
 * Map a user-supplied core name to its kernel table.
 *
 * coretype: name to look up; compared case-insensitively against
 *           corename[1..24], at most 20 characters.
 *
 * Returns the matching table, or NULL after emitting a "Core not found"
 * warning when the name is not recognised.
 */
static gotoblas_t *force_coretype(char *coretype){
  int i;
  int found = -1;
  char message[128];

  /* Index 0 ("Unknown") is deliberately not selectable. */
  for (i = 1; i <= 24; i++) {
    if (!strncasecmp(coretype, corename[i], 20)) {
      found = i;
      break;
    }
  }

  if (found < 0) {
    snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
    openblas_warning(1, message);
    return NULL;
  }

  /* Case labels are indices into corename[]. */
  switch (found) {
    case 24: return (&gotoblas_SKYLAKEX);
    case 23: return (&gotoblas_ZEN);
    case 22: return (&gotoblas_EXCAVATOR);
    case 21: return (&gotoblas_STEAMROLLER);
    case 20: return (&gotoblas_HASWELL);
    case 19: return (&gotoblas_PILEDRIVER);
    case 18: return (&gotoblas_BULLDOZER);
    case 17: return (&gotoblas_BOBCAT);
    case 16: return (&gotoblas_SANDYBRIDGE);
    case 15: return (&gotoblas_NANO);
    case 14: return (&gotoblas_BARCELONA);
    /* corename[13] is "Opteron_SSE3" and corename[12] is "Opteron"; these
       two cases used to be swapped, so each name selected the other core. */
    case 13: return (&gotoblas_OPTERON_SSE3);
    case 12: return (&gotoblas_OPTERON);
    case 11: return (&gotoblas_ATHLON);
    case 10: return (&gotoblas_NEHALEM);
    case  9: return (&gotoblas_DUNNINGTON);
    case  8: return (&gotoblas_PENRYN);
    case  7: return (&gotoblas_CORE2);
    case  6: return (&gotoblas_ATOM);
    case  5: return (&gotoblas_BANIAS);
    case  4: return (&gotoblas_PRESCOTT);
    case  3: return (&gotoblas_NORTHWOOD);
    case  2: return (&gotoblas_COPPERMINE);
    case  1: return (&gotoblas_KATMAI);
  }
  return NULL;
}
/*
 * Select and initialise the kernel table for this process.
 *
 * Does nothing if `gotoblas` is already set. Otherwise the core is taken
 * from the OPENBLAS_CORETYPE environment variable when present, or
 * auto-detected. If neither yields a table, KATMAI (ARCH_X86 builds) or
 * PRESCOTT (other builds) is used. The chosen table's init() hook is then
 * called. If the table has no init hook the process exits with status 1.
 */
void gotoblas_dynamic_init(void) {
  char coremsg[128];
  char *p;

  if (gotoblas) return;

  p = getenv("OPENBLAS_CORETYPE");
  if (p) {
    gotoblas = force_coretype(p);
  } else {
    gotoblas = get_coretype();
  }

#ifdef ARCH_X86
  if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
  if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
  /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
  if (sizeof(void*) == 8) {
    if (gotoblas == &gotoblas_KATMAI ||
        gotoblas == &gotoblas_COPPERMINE ||
        gotoblas == &gotoblas_NORTHWOOD ||
        gotoblas == &gotoblas_BANIAS ||
        gotoblas == &gotoblas_ATHLON)
      gotoblas = &gotoblas_PRESCOTT;
  }
#endif

  if (gotoblas && gotoblas->init) {
    /*
     * Format the name directly with a bounded conversion. The previous
     * strncpy() into an uninitialised buffer did not guarantee a NUL
     * terminator for names of 20 characters or more.
     */
    snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
    openblas_warning(2, coremsg);
    gotoblas->init();
  } else {
    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
}
/* Forget the currently selected kernel table by resetting `gotoblas`. */
void gotoblas_dynamic_quit(void)
{
  gotoblas = NULL;
}

View File

@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
@@ -114,20 +118,22 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
so : ../$(LIBSONAME)
ifeq ($(OSNAME), Android)
INTERNALNAME = $(LIBPREFIX).so
FEXTRALIB += -lm
EXTRALIB += -lm
else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
@@ -139,6 +145,14 @@ else
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
endif
ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
@@ -150,6 +164,7 @@ else
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
endif
rm -f linktest

View File

@@ -40,15 +40,25 @@
void gotoblas_init(void);
void gotoblas_quit(void);
#if defined(SMP) && defined(USE_TLS)
void blas_thread_memory_cleanup(void);
#endif
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
if (reason == DLL_PROCESS_ATTACH) {
gotoblas_init();
}
if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit();
switch(reason) {
case DLL_PROCESS_ATTACH:
gotoblas_init();
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP) && defined(USE_TLS)
blas_thread_memory_cleanup();
#endif
break;
}
return TRUE;

12
f_check
View File

@@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;
@@ -292,9 +292,6 @@ if ($link ne "") {
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= $flags . " ";
}
@@ -311,17 +308,11 @@ if ($link ne "") {
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /^\-rpath-link\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ;
}
@@ -330,7 +321,6 @@ if ($link ne "") {
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
&& ($flags !~ /numa/)
&& ($flags !~ /crt[0-9]/)
&& ($flags !~ /gcc/)
&& ($flags !~ /user32/)

157
getarch.c
View File

@@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#endif
/* Define NO_AVX512 unless the compiler is either GCC newer than major
   version 6 with __AVX2__ defined, or clang with major version 6 or later. */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#define NO_AVX512
#endif
/* #define FORCE_P2 */
/* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */
@@ -326,6 +330,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "HASWELL"
#endif
/* Forced-target settings for FORCE_SKYLAKEX.
   When NO_AVX512 is defined the target is configured as HASWELL instead
   (SUBARCHITECTURE, ARCHCONFIG, LIBNAME and CORENAME all name HASWELL);
   otherwise the SKYLAKEX settings are used, whose ARCHCONFIG additionally
   carries -DHAVE_AVX512VL and -march=skylake-avx512. */
#ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG "-DSKYLAKEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#endif
#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
@@ -603,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER8"
#endif
/* Forced-target settings for FORCE_POWER9: sets FORCE, the POWER
   architecture / POWER9 subarchitecture strings, the "power" subdirectory
   name, and an ARCHCONFIG flag string with the L1/L2 cache and DTB values. */
#if defined(FORCE_POWER9)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER9"
#define SUBDIRNAME "power"
#define ARCHCONFIG "-DPOWER9 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "power9"
#define CORENAME "POWER9"
#endif
#ifdef FORCE_PPCG4
#define FORCE
@@ -912,11 +958,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DARMV8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "armv8"
#define CORENAME "ARMV8"
#endif
/* Forced-target settings for FORCE_CORTEXA53 (ARM64): sets FORCE, the
   architecture / subarchitecture / subdirectory strings, and an ARCHCONFIG
   flag string with L1 code, L1 data, L2 and DTB values plus the VFP/NEON
   feature flags and -DARMV8.  The trailing #else branch is empty. */
#ifdef FORCE_CORTEXA53
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA53"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA53 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa53"
#define CORENAME "CORTEXA53"
#else
#endif
#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
@@ -927,26 +990,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif
#ifdef FORCE_VULCAN
#ifdef FORCE_CORTEXA72
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VULCAN"
#define SUBARCHITECTURE "CORTEXA72"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVULCAN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
#define ARCHCONFIG "-DCORTEXA72 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "vulcan"
#define CORENAME "VULCAN"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa72"
#define CORENAME "CORTEXA72"
#else
#endif
/* Forced-target settings for FORCE_CORTEXA73 (ARM64): sets FORCE, the
   architecture / subarchitecture / subdirectory strings, and an ARCHCONFIG
   flag string with L1 code, L1 data, L2 and DTB values plus the VFP/NEON
   feature flags and -DARMV8.  The trailing #else branch is empty. */
#ifdef FORCE_CORTEXA73
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA73"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA73 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa73"
#define CORENAME "CORTEXA73"
#else
#endif
/* Forced-target settings for FORCE_FALKOR (ARM64): sets FORCE, the
   architecture / subarchitecture / subdirectory strings, and an ARCHCONFIG
   flag string with L1 code, L1 data, L2 and DTB values plus the VFP/NEON
   feature flags and -DARMV8.  The trailing #else branch is empty. */
#ifdef FORCE_FALKOR
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "FALKOR"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DFALKOR " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "falkor"
#define CORENAME "FALKOR"
#else
#endif
@@ -958,13 +1052,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DTHUNDERX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX2T99"
@@ -975,12 +1071,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
#endif
/* Forced-target settings for FORCE_TSV110 (ARM64): sets FORCE, the
   architecture / subarchitecture / subdirectory strings, and an ARCHCONFIG
   flag string with L1 code, L1 data, L2 and DTB values plus the VFP/NEON
   feature flags and -DARMV8.  The trailing #else branch is empty. */
#ifdef FORCE_TSV110
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "TSV110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTSV110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"
@@ -1001,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13"
#endif
/* Forced-target settings for FORCE_Z14: sets FORCE, the ZARCH architecture /
   Z14 subarchitecture strings, and an ARCHCONFIG flag string that defines
   Z14 and DTB_DEFAULT_ENTRIES only. */
#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif
#ifndef FORCE
#ifdef USER_TARGET
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt"
#endif
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
#ifndef POWER
@@ -1181,9 +1308,7 @@ int main(int argc, char *argv[]){
#elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif
break;

View File

@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
rotm.c rotmg.c # N.B. these do not have complex counterparts
rot.c
asum.c
sum.c
)
# these will have 'z' prepended for the complex version
@@ -23,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
endif ()
if (${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
endif ()
endforeach ()

View File

@@ -25,7 +25,7 @@ SBLAS1OBJS = \
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
scopy.$(SUFFIX) sscal.$(SUFFIX) \
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
@@ -51,7 +51,7 @@ DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
ddot.$(SUFFIX) \
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
@@ -76,7 +76,7 @@ CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
scamax.$(SUFFIX) icamax.$(SUFFIX) \
scamin.$(SUFFIX) icamin.$(SUFFIX) \
csrot.$(SUFFIX) crotg.$(SUFFIX) \
@@ -105,7 +105,7 @@ ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
@@ -146,7 +146,7 @@ QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qdot.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -168,7 +168,7 @@ XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -224,7 +224,7 @@ QBLAS3OBJS = \
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -260,10 +260,11 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -277,10 +278,11 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX)
CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -294,13 +296,14 @@ CDBLAS3OBJS += \
cblas_dgeadd.$(SUFFIX)
CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
cblas_ccopy.$(SUFFIX) \
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX)
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@@ -320,13 +323,15 @@ CCBLAS3OBJS = \
CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX)
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
CZBLAS2OBJS = \
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \
@@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c $< -o $(@F)
@@ -1359,6 +1382,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
@@ -1371,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@@ -1383,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@@ -1390,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

View File

@@ -40,11 +40,11 @@
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#endif
#if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000
#else
#define MULTI_THREAD_MINIMAL 10000
#define MULTI_THREAD_MINIMAL 10000
#endif
#ifndef CBLAS
@@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (alpha == ZERO) return;
if (incx == 0 && incy == 0) {
*y += n * alpha *(*x);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -83,17 +88,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
//Temporarily work-around the low performance issue with small imput size &
//
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (n <= MULTI_THREAD_MINIMAL)
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (nthreads == 1) {
#endif

View File

@@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m;
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@@ -44,6 +44,7 @@
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE
#define ERROR_NAME "QGEMM "
#elif defined(DOUBLE)
@@ -52,6 +53,7 @@
#define ERROR_NAME "SGEMM "
#endif
#else
#define SMP_THRESHOLD_MIN 8192.0
#ifndef GEMM3M
#ifdef XDOUBLE
#define ERROR_NAME "XGEMM "
@@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
FLOAT *sa, *sb;
#ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK;
#ifndef COMPLEX
#ifdef XDOUBLE
@@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
XFLOAT *sa, *sb;
#ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK;
#ifndef COMPLEX
#ifdef XDOUBLE
@@ -273,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
PRINT_DEBUG_CNAME;
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) {
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#ifndef COMPLEX
args.alpha = (void *)&alpha;
args.beta = (void *)&beta;
@@ -411,25 +417,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
nthreads_max = num_cpu_avail(3);
nthreads_avail = nthreads_max;
#ifndef COMPLEX
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#else
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#endif
args.common = NULL;
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else
args.nthreads = nthreads_max;
args.nthreads = num_cpu_avail(3);
args.common = NULL;
if (args.nthreads == 1) {
#endif

View File

@@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) lenx = m;
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;

View File

@@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
blas_level1_thread(mode, n, k1, k2, dummyalpha,
a, lda, NULL, 0, ipiv, incx,
laswp[flag], nthreads);
(int(*)())laswp[flag], nthreads);
}
#endif

Some files were not shown because too many files have changed in this diff Show More