Compare commits

...

528 Commits

Author SHA1 Message Date
Martin Kroeker
e32f3b1447 Restore -march flag for Android builds
fixes #2419 - renewed discussion in #2112 suggests removal of the option was primarily aimed at non-Android builds
2020-02-16 17:32:13 +01:00
Martin Kroeker
5e94aa4877 Merge pull request #2417 from marxin/make-ctest-verbose-for-drone
Make ctest verbose for drone
2020-02-15 21:57:41 +01:00
Martin Kroeker
93f3e27574 Merge pull request #2415 from marxin/add-cmake-to-gitignore
Add CMake related files to .gitignore.
2020-02-15 21:57:03 +01:00
Martin Kroeker
785c389b0e Merge pull request #2420 from martin-frbg/issue2396
Correct generation of GETRF files by the CMAKE build
2020-02-15 21:56:16 +01:00
Martin Kroeker
c222b25b81 Correct generation of GETRF files by the CMAKE build
fixes #2396
2020-02-15 19:29:14 +01:00
Martin Kroeker
221da8bf05 Merge pull request #2411 from martin-frbg/fix2254-038
Fix pre-processed POWER8 codes and wrong conditionals in the POWER8,PPC440 and PPC970 KERNEL files
2020-02-14 23:07:43 +01:00
Martin Liska
eb285b4d20 Make ctest verbose for drone builder. 2020-02-14 10:46:37 +01:00
Martin Kroeker
cafdd999b8 Update caxpy_power8.S 2020-02-13 22:44:09 +01:00
Martin Kroeker
92ca92a46c Update caxpy_power8.S 2020-02-13 21:24:54 +01:00
Martin Kroeker
486c35c5dc Update icamin_power8.S 2020-02-13 18:38:43 +01:00
Martin Liska
0e05ea9bac Add CMake related files to .gitignore. 2020-02-13 14:51:55 +01:00
Martin Kroeker
5ba3699f41 Update isamin_power8.S 2020-02-13 00:00:32 +01:00
Martin Kroeker
8eefa530cd Update isamax_power8.S 2020-02-12 23:59:50 +01:00
Martin Kroeker
de40d47edf Update isamin_power8.S 2020-02-12 23:57:48 +01:00
Martin Kroeker
7c162b8a21 Update isamax_power8.S 2020-02-12 23:56:57 +01:00
Martin Kroeker
0544cbc806 Fix syntax of endianness conditional 2020-02-12 20:00:29 +01:00
Martin Kroeker
120d20731f Fix syntax of endianness conditional 2020-02-12 19:58:42 +01:00
Martin Kroeker
dc345d84df Fix syntax of endianness conditional and add gcc version check for workaround 2020-02-12 19:56:52 +01:00
Martin Kroeker
616921fd91 Merge pull request #27 from xianyi/develop
rebase
2020-02-12 19:16:14 +01:00
Martin Kroeker
8a9e9a82a1 Merge pull request #2410 from bartoldeman/fix-dscal-inline-asm
Fix inline asm in dscal: mark x, x1 as clobbered. Fixes #2408
2020-02-12 15:38:37 +01:00
Bart Oldeman
7ea5e07d1c Fix inline asm in dscal: mark x, x1 as clobbered. Fixes #2408
The leaq instructions in dscal_kernel_inc_8 modify x and x1 so they
must be declared as input/output constraints, otherwise the compiler
may assume the corresponding registers are not modified.
2020-02-12 14:11:44 +00:00
Martin Kroeker
cb6ef49857 Merge pull request #2407 from susilehtola/patch-2
Patch out instances of Z15 in dynamic_zarch.c
2020-02-11 13:04:44 +01:00
Martin Kroeker
63994e1cdb Merge pull request #2405 from susilehtola/patch-1
Fix typo in dynamic_zarch.c
2020-02-11 13:03:35 +01:00
Martin Kroeker
496e3019bc Merge pull request #2404 from martin-frbg/issue2395
Fix spurious application of USE_TRMM in cmake builds
2020-02-11 13:00:36 +01:00
Martin Kroeker
169be3f097 Merge pull request #2403 from martin-frbg/issue2400
Fix coretype identification of Intel Cannon Lake, Ice Lake and Goldmont
2020-02-11 13:00:16 +01:00
Martin Kroeker
6ccbb089c2 Merge pull request #2402 from gxw-loongson/develop
Avoid printing the following information on mips and mips64 when check msa
2020-02-11 12:59:53 +01:00
Martin Kroeker
59ebe3636a Merge pull request #2399 from martin-frbg/buffersize
Make BUFFER_SIZE configurable at build time
2020-02-11 12:56:56 +01:00
Susi Lehtola
5a6bba3061 Patch out instances of Z15 in dynamic_zarch.c
There does not appear to be a Z15 kernel yet, causing link errors from the code. This patch fixes the issue.
2020-02-11 15:07:33 +13:00
Susi Lehtola
dff173e50e Fix typo in dynamic_zarch.c 2020-02-11 14:46:30 +13:00
Martin Kroeker
7e5cbb6f35 Fix bad conditional syntax that caused spurious application of USE_TRMM 2020-02-10 21:17:39 +01:00
Martin Kroeker
303bdb673b Fix coretype detection for Intel extended models 6 and 7
affecting Goldmont, Cannon Lake, Ice Lake autodetection
2020-02-10 19:17:32 +01:00
gxw
754433f420 Avoid printing the following information on mips and mips64 when check msa:
"unrecognized command line option ‘-mmsa’"
2020-02-10 19:11:45 +08:00
Martin Kroeker
7f0d523b42 Make BUFFER_SIZE configurable 2020-02-09 23:32:57 +01:00
Martin Kroeker
c353d8b106 Make BUFFER_SIZE configurable 2020-02-09 23:30:22 +01:00
Martin Kroeker
579be3aa9d Add configuration option for BUFFER_SIZE 2020-02-09 23:28:04 +01:00
Martin Kroeker
449e8ea443 Merge pull request #26 from xianyi/develop
rebase
2020-02-09 23:23:55 +01:00
Martin Kroeker
3bec250cf9 Increment version to 0.3.9.dev 2020-02-09 23:18:44 +01:00
Martin Kroeker
f03dd23e90 Increment version to 0.3.9.dev 2020-02-09 23:18:07 +01:00
Martin Kroeker
fa93d63365 Merge branch 'release-0.3.0' into develop 2020-02-09 23:16:06 +01:00
Martin Kroeker
90e6c66a57 Merge pull request #2397 from martin-frbg/038changes
Update Changelog with changes from 0.3.8
2020-02-09 23:01:52 +01:00
Martin Kroeker
32d97330b3 Update with changes from 0.3.8 2020-02-09 23:00:36 +01:00
Martin Kroeker
29eaf4b6d7 Merge pull request #25 from xianyi/develop
rebase
2020-02-09 22:48:15 +01:00
Martin Kroeker
47c1bf7f4d typo fixes 2020-02-09 01:06:40 +01:00
Martin Kroeker
2b55f0ad30 Merge pull request #2393 from martin-frbg/issue2388
Provide more documentation in README.md
2020-02-09 01:00:33 +01:00
Martin Kroeker
a5b32ab06c Merge pull request #2390 from martin-frbg/pgi
Small corrections for compilation with PGI compilers
2020-02-09 00:13:40 +01:00
Martin Kroeker
50545b19d0 Update CPU and OS support and document DYNAMIC_ARCH option in README.md
prompted by #2388
2020-02-09 00:06:07 +01:00
Martin Kroeker
b3cbd60d7a Remove PGI from list again as it is actually still not capable 2020-02-08 10:20:13 +01:00
Martin Kroeker
70199d1905 Merge pull request #2389 from Zeyiii/develop
Fix bugs in benchmark of gemv
2020-02-07 16:05:46 +01:00
Martin Kroeker
cfe63d8cc2 Remove OpenMP libraries from link list 2020-02-07 16:03:51 +01:00
Martin Kroeker
d55b10830f Remove OpenMP libraries from link list 2020-02-07 16:02:17 +01:00
Martin Kroeker
c1c10cbb21 Merge pull request #2384 from wjc404/develop
Optimize AVX512 DGEMM (& DTRMM)
2020-02-07 13:47:12 +01:00
Martin Kroeker
5989841524 Add PGI to avx512-supporting compilers 2020-02-07 13:01:31 +01:00
Martin Kroeker
68a43db358 Fix utest compilation with PGI 2020-02-07 10:15:18 +01:00
Martin Kroeker
9694037b23 Set SUFFIX in tempfile commands, fix bad architecture option for PGI compiler in avx512 test 2020-02-07 10:09:25 +01:00
Martin Kroeker
71faa1c1a7 Merge pull request #24 from xianyi/develop
rebase
2020-02-07 10:03:02 +01:00
wjc404
3447d04eaf Update dgemm_kernel_16x2_skylakex.c 2020-02-06 02:14:10 +00:00
wjc404
8b5cdcc64c Update sgemm_kernel_8x4_haswell.c 2020-02-06 01:47:46 +00:00
wjc404
4e00d96a78 Update dgemm_kernel_16x2_skylakex.c 2020-02-06 01:46:36 +00:00
w00421467
ce9ea8f826 Fix another branch 2020-02-05 15:07:18 +08:00
w00421467
0b909203cb Fix bugs in benchmark of gemv 2020-02-05 14:53:37 +08:00
wjc404
096da2f51a Update dgemm_kernel_16x2_skylakex.c 2020-02-05 13:36:57 +08:00
wjc404
2f96a2c55b Update trmm_R.c 2020-02-05 10:15:02 +08:00
wjc404
833bd0f8ff Update trmm_L.c 2020-02-05 10:09:41 +08:00
wjc404
77b8f49556 Update level3_thread.c 2020-02-04 20:33:08 +08:00
wjc404
1c3e20ce48 Update level3.c 2020-02-04 20:30:23 +08:00
wjc404
83b6be7976 Update param.h 2020-02-04 19:55:26 +08:00
wjc404
081b188529 Update KERNEL.SKYLAKEX 2020-02-03 21:38:08 +08:00
wjc404
f3f969f681 Update param.h 2020-02-03 21:34:12 +08:00
wjc404
8019e70211 AVX512 16x2 DGEMM kernel 2020-02-03 21:32:56 +08:00
Martin Kroeker
8d2a796f49 Merge pull request #2378 from martin-frbg/issue2377
Add -march option for AVX512 in cmake as well
2020-01-30 17:07:19 +01:00
Martin Kroeker
8dc9fd4dfe Add -march option for AVX512 2020-01-30 12:41:18 +01:00
Martin Kroeker
abc67bdd74 Merge pull request #2375 from ewanglong/master
fix a few performance drop in some matrix size per data type
2020-01-30 10:27:29 +01:00
Martin Kroeker
1f62a82789 Merge pull request #2376 from wjc404/develop
Fix remaining bugs in parallel GEMM3M
2020-01-23 21:50:19 +01:00
wjc404
e9fb8f62b1 Update level3_gemm3m_thread.c 2020-01-22 17:40:03 +00:00
Wang,Long
fbf4f48f4a fix a few performance drop in some matrix size per data type
Signed-off-by: Wang,Long <long1.wang@intel.com>
2020-01-22 15:15:04 +00:00
Martin Kroeker
b9ad450295 Merge pull request #2373 from Qiyu8/optimize#gemmbeta
Optimize genenal Gemm Beta
2020-01-21 15:05:38 +01:00
Martin Kroeker
e011ad820a Merge pull request #2372 from martin-frbg/winexit
Do not run any cleanup if the program is exiting anyway
2020-01-21 14:56:45 +01:00
Qiyu8
ff42e68652 Optimize genenal Gemm Beta 2020-01-20 11:49:42 +08:00
Martin Kroeker
23f322f997 Do not run any cleanup if the program is exiting anyway
From keno's PR #2350 - this avoids the potential hang in blas_thread_shutdown where we may wait for threads to exit while they are waiting on the loader lock from DllMain
2020-01-19 13:28:27 +01:00
Martin Kroeker
093d37de8d Merge pull request #2371 from martin-frbg/issue2370
Free Windows thread memory with MEM_RELEASE rather than MEM_DECOMMIT
2020-01-18 20:39:34 +01:00
Martin Kroeker
d65e9a2bbd Merge pull request #2253 from thrasibule/xerbla
fix error messages
2020-01-18 20:39:04 +01:00
Martin Kroeker
78100b8093 Free Windows thread memory with MEM_RELEASE rather than MEM_DECOMMIT
as suggested by hjmndv in #2370
2020-01-18 15:06:39 +01:00
Martin Kroeker
70f45749b9 Merge pull request #2367 from wjc404/develop
Improve paralleled SGEMM performance on SKYLAKEX CPUs
2020-01-15 21:13:43 +01:00
wjc404
e5dcdeb550 Update sgemm_direct_skylakex.c 2020-01-13 16:59:23 +08:00
wjc404
952cc2ba38 Update sgemm_kernel_16x4_skylakex_2.c 2020-01-13 16:58:54 +08:00
wjc404
feaafbedd3 make skylakex sgemm code more friendly for readers
BTW some kernels were adjusted to improve performance
2020-01-13 16:28:41 +08:00
wjc404
1c67567008 improve skylakex paralleled sgemm performance 2020-01-13 16:26:03 +08:00
Martin Kroeker
4e979bf75b Merge pull request #2366 from martin-frbg/install390
Add new file lapack.h from LAPACK 3.9.0 to installable headers
2020-01-13 09:00:21 +01:00
Martin Kroeker
daa4310db5 Install new lapack.h
new file in LAPACK 3.9.0, split off from lapacke.h
2020-01-12 22:00:50 +01:00
Martin Kroeker
b8f3605132 Merge pull request #23 from xianyi/develop
rebase
2020-01-12 21:57:23 +01:00
Martin Kroeker
b36018be6d Merge pull request #2365 from wjc404/develop
Fix SKYLAKEX STRMM issues
2020-01-09 23:23:09 +01:00
wjc404
3a100b2797 Update KERNEL.SKYLAKEX 2020-01-09 13:48:41 +08:00
Martin Kroeker
38742d5547 Merge pull request #2361 from wjc404/develop
Optimize AVX2 SGEMM & STRMM
2020-01-08 16:20:28 +01:00
wjc404
bd4c032f52 Update sgemm_kernel_8x4_haswell.c 2020-01-07 11:22:46 +08:00
wjc404
9dc9b7b95e Update sgemm_kernel_8x4_haswell.c 2020-01-06 20:11:36 +08:00
wjc404
9f5cdc49d4 Update CONTRIBUTORS.md 2020-01-06 12:28:43 +08:00
wjc404
b7b408a120 optimize AVX2 SGEMM 2020-01-06 12:16:09 +08:00
wjc404
92b10212de optimize AVX2 SGEMM 2020-01-06 12:11:21 +08:00
wjc404
b73bf01378 optimize AVX2 SGEMM 2020-01-06 12:09:14 +08:00
wjc404
eb3c9f1db9 optimize AVX2 SGEMM 2020-01-06 12:07:02 +08:00
Martin Kroeker
fd2ff2714f Merge pull request #2359 from martin-frbg/lapack-pr330
Apply LAPACKE fix for eigenvector transposition in symmetric eigensolvers
2020-01-03 15:03:30 +01:00
Martin Kroeker
2ea2bd99c7 Apply LAPACKE fix for eigenvector transposition in symmetric eigensolvers
from Reference-LAPACK PR 330
2020-01-03 11:10:00 +01:00
Martin Kroeker
fbb894948c Merge pull request #22 from xianyi/develop
rebase
2020-01-03 10:23:25 +01:00
Martin Kroeker
e711659c90 Merge pull request #2358 from shengyang-3390/develop
Test all 7 declared values of N in float and double cblas3 tests
2020-01-03 09:02:03 +01:00
shengyang
893e6e57c4 modified: ctest/din3 ctest/sin3 2020-01-03 10:03:33 +08:00
Martin Kroeker
456ee2e1f0 Merge pull request #2357 from chenxuqiang/dgemm_beta_zero
kernel/arm64/dgemm_beta.S: add beta == zero branch
2020-01-02 22:28:36 +01:00
Martin Kroeker
9998f8ed8b Merge pull request #2356 from shengyang-3390/develop
Use arm neon instructions to optimize ncopy operation (and enable 7th column in float+complex cblas3 test drivers)
2020-01-02 22:27:44 +01:00
shengyang
80db5f11e1 update 2020-01-02 11:01:57 +08:00
chenxuqiang
52de4cc8fd kernel/arm64/dgemm_beta.S: add beta == zero branch
added beta == zero branch, and no need to load C matrix.

Signed by: Xuqiang Chen <chenxuqiang3@hisilicon.com>
2020-01-01 21:50:45 -05:00
Martin Kroeker
44028581cc Merge pull request #2355 from Zeyiii/dev-zeyi2
Use arm neon instructions to optimize sgemm_beta operation
2020-01-01 22:14:16 +01:00
Martin Kroeker
86ab939936 Merge pull request #2354 from ZuoQ3/develop
[WIP] Use arm neon instructions to optimize tcopy operation
2020-01-01 22:13:37 +01:00
Martin Kroeker
375b1875c8 [WIP] Update LAPACK to 3.9.0 (#2353)
* Update make.inc entries for LAPACK 3.9.0

Reference-LAPACK PR 347 changed some variable names and relative paths

* Update LAPACK to 3.9.0

* Add new functions from LAPACK 3.9.0

* Add new functions from LAPACK 3.9.0

* Restore LOADER command 

as it makes it easier to specify pthread as needed

* Restore LOADER

* Restore EIG/LIN prefixes in cmdbase

* add binary path to lapack_testing.py call

* Restore OpenMP version check

* Restore OpenMP version check

* Restore fix for out-of-bounds array accesses

from #2096
2020-01-01 13:18:53 +01:00
Martin Kroeker
6c85cb1869 Merge pull request #2352 from wjc404/develop
AVX2 ZGEMM3M kernel
2019-12-31 18:08:10 +01:00
Martin Kroeker
995768bbc5 Merge pull request #2351 from Zeyiii/develop
prefetching for dgemm_beta
2019-12-31 18:07:37 +01:00
int_13h
96ad579428 add in runtime cpu detection for zarch (#2349)
add in runtime cpu detection for zarch
2019-12-31 18:03:27 +01:00
shengyang
8d84403205 Use arm neon instructions to optimize ncopy operation
modified:   KERNEL.ARMV8
	modified:   KERNEL.TSV110
	new file:   sgemm_ncopy_4.S
2019-12-31 17:06:35 +08:00
shengyang
8729db117c modified: ctest/din3
modified:   ctest/sin3
2019-12-31 15:59:52 +08:00
w00421467
0833a4846a Use arm neon instructions to optimize sgemm_beta operation 2019-12-31 10:42:03 +08:00
zq
50f7fc1401 [WIP] Use arm neon instructions to optimize tcopy operation 2019-12-31 10:21:23 +08:00
w00421467
d1b53806be Merge remote-tracking branch 'pub/develop' into develop 2019-12-31 10:13:24 +08:00
wjc404
a0f0a802fc Update zgemm3m_kernel_4x4_haswell.c 2019-12-30 17:33:42 +08:00
wjc404
700fe5b5ee Add files via upload 2019-12-30 17:18:59 +08:00
wjc404
bb2729c855 Update CONTRIBUTORS.md 2019-12-30 16:11:37 +08:00
wjc404
aae44d040d Update CONTRIBUTORS.md 2019-12-30 16:10:08 +08:00
wjc404
6362c34ee6 Update param.h 2019-12-30 16:08:19 +08:00
wjc404
f60840c420 Update KERNEL.ZEN 2019-12-30 16:04:23 +08:00
wjc404
109e18cd96 Update KERNEL.HASWELL 2019-12-30 16:03:24 +08:00
wjc404
ae1579be13 Create zgemm3m_kernel_4x4_haswell.c 2019-12-30 16:02:51 +08:00
w00421467
3ccf8885ac prefetching for dgemm_beta 2019-12-30 11:45:49 +08:00
Martin Kroeker
454847588e Update LAPACK to 3.9.0 2019-12-29 21:27:18 +01:00
Martin Kroeker
0257f26488 Merge pull request #21 from xianyi/develop
rebase
2019-12-29 18:08:55 +01:00
Martin Kroeker
c45b7aef14 Merge pull request #2348 from wjc404/develop
AVX2 CGEMM3M kernel
2019-12-28 20:07:56 +01:00
wjc404
312060d0d6 Update CONTRIBUTORS.md 2019-12-27 23:36:13 +08:00
wjc404
cd765f094b Update cgemm3m_kernel_8x4_haswell.c 2019-12-27 18:23:29 +08:00
wjc404
64639f440f Update param.h 2019-12-27 18:06:42 +08:00
wjc404
3a66c8cac1 Update KERNEL.ZEN 2019-12-27 18:04:08 +08:00
wjc404
4c35b8dbaa Update gemm3m_level3.c 2019-12-27 18:03:01 +08:00
wjc404
ed9af2f7da Update KERNEL.HASWELL 2019-12-27 18:01:38 +08:00
wjc404
5fd1edead9 Create cgemm3m_kernel_8x4_haswell.c 2019-12-27 18:00:55 +08:00
Martin Kroeker
26478eb0d0 Merge pull request #2345 from wjc404/develop
Optimize AVX2 CGEMM
2019-12-25 22:26:41 +01:00
wjc404
eeecd623d8 Update cgemm_kernel_8x2_haswell.c 2019-12-24 00:40:16 +08:00
wjc404
3ce6bcdb5f Update CONTRIBUTORS.md 2019-12-24 00:30:16 +08:00
wjc404
6fbe51072b Update CONTRIBUTORS.md 2019-12-24 00:24:40 +08:00
wjc404
611445c7f8 Update param.h 2019-12-23 23:44:55 +08:00
wjc404
2cd9306bb5 Update KERNEL.ZEN 2019-12-23 23:42:30 +08:00
wjc404
c418c81224 Update KERNEL.HASWELL 2019-12-23 23:41:44 +08:00
wjc404
025741f16a Fast Haswell CGEMM kernel 2019-12-23 23:40:03 +08:00
Martin Kroeker
0ae49d2990 Merge pull request #2344 from wjc404/develop
Optimize AVX2 ZGEMM
2019-12-21 12:16:55 +01:00
wjc404
105e26e12a Adjust Haswell ZGEMM blocking parameters 2019-12-21 14:38:51 +08:00
wjc404
f41d52665d Fast Haswell ZGEMM kernel 2019-12-21 14:37:06 +08:00
wjc404
d573d24de7 Fast Haswell ZGEMM kernel 2019-12-21 14:35:15 +08:00
Martin Kroeker
31d6c2eb7d Merge pull request #2340 from Zeyiii/develop
[WIP] Use arm neon instructions to optimize gemm beta operation
2019-12-20 08:38:57 +01:00
w00421467
b7cc69ee62 declare DGEMM_BETA in KERNEL.ARMV8 rather than the generic KERNEL 2019-12-20 10:11:50 +08:00
w00421467
aeef942c4f use arm neon instructions to optimize gemm beta operation 2019-12-17 10:00:13 +08:00
Martin Kroeker
445ca2f418 Merge pull request #2339 from Jehan/wip/Jehan/fix-timeout
driver: more reasonable thread wait timeout on Windows.
2019-12-13 14:57:26 +01:00
Jehan
13226e3101 driver: more reasonable thread wait timeout on Windows.
It used to be 5ms, which might not be long enough in some cases for the
thread to exit well, but then when set to 5000 (5s), it would slow down
any program depending on OpenBlas.

Let's just set it to 50ms, which is at least 10 times longer than
originally, but still reasonable in case of failed thread termination.
2019-12-13 09:52:33 +01:00
Martin Kroeker
1a6ea8ee6d Merge pull request #2338 from kavanabhat/aix_mod
Changes to build on AIX in POWER8 mode
2019-12-09 17:54:49 +01:00
Martin Kroeker
c6ecb195e6 Merge pull request #2337 from martin-frbg/issue2336
Support two-digit version numbers in gcc version check
2019-12-07 09:38:06 +01:00
Martin Kroeker
b28db31429 Support two-digit version numbers in gcc version check
fixes #2336 (non-recognition of gcc 10) with patch provided by JeffreyALaw.
2019-12-06 21:23:56 +01:00
Kavana Bhat
6baa9b07d7 AIX changes for Power8 2019-12-06 04:33:32 -06:00
Martin Kroeker
a4896b5538 Update DYNAMIC_ARCH support for ARM64 and PPC (#2332)
* Update DYNAMIC_ARCH list of ARM64 targets for gmake
* Update arm64 cpu list for runtime detection
* Update DYNAMIC_ARCH list of ARM64 targets for cmake and add POWERPC targets
2019-12-04 11:06:03 +01:00
Kavana Bhat
3938e59569 AIX changes for Power8 2019-12-04 00:23:46 -06:00
Martin Kroeker
9d5079008f Merge pull request #2334 from martin-frbg/fix2228
Remove misplaced file
2019-12-03 22:23:52 +01:00
Martin Kroeker
3518617f5b Add Intel Goldmont+ cpuid
was originally in #2228 but that PR had misplaced the file in the toplevel directory
2019-12-03 08:32:29 +01:00
Martin Kroeker
715f4650d9 Delete stray copy of dynamic.c from PR 2228 2019-12-03 08:24:10 +01:00
Martin Kroeker
10705183ce Merge pull request #20 from xianyi/develop
Rebase
2019-12-03 08:22:40 +01:00
Martin Kroeker
235599f17a Merge pull request #2329 from isuruf/patch-1
Workaround an ICE in clang 9.0.0
2019-12-02 08:30:43 +01:00
Isuru Fernando
b863b32ac5 Workaround an ICE in clang 9.0.0
This bug is not there in 8.x nor in the 9.0 daily snapshot.
2019-12-01 12:59:46 -06:00
Martin Kroeker
dd04143d4a Merge pull request #2328 from martin-frbg/ppc9
Fix precompiled kernels on POWER9 and make their use conditional on (old) gcc version
2019-11-30 12:23:57 +01:00
Martin Kroeker
f3a6164bff Merge pull request #2324 from antonblanchard/power9_segv
Fix SEGV in cdot_power9
2019-11-30 00:03:42 +01:00
Martin Kroeker
dedd822d1a Fix caxpy/caxpyc naming in localentry 2019-11-29 23:56:57 +01:00
Martin Kroeker
2181fb7047 Fix caxpy/caxpyc naming in localentry 2019-11-29 23:54:15 +01:00
Martin Kroeker
a9b62c03f8 Substitute precompiled gcc7 codes only when gcc is older than 9.x 2019-11-29 23:49:50 +01:00
Martin Kroeker
97762234f9 Add variable for gcc >=9 test
used in KERNEL.POWER9
2019-11-29 23:47:23 +01:00
Martin Kroeker
948d11fc51 Merge pull request #19 from xianyi/develop
rebase
2019-11-29 23:44:09 +01:00
Martin Kroeker
c815b8fb85 Merge pull request #2323 from wjc404/develop
some optimizations of AVX512 DGEMM
2019-11-28 20:55:16 +01:00
wjc404
e20709e976 Update param.h 2019-11-28 19:57:50 +08:00
wjc404
934e601e93 Update dgemm_kernel_4x8_skylakex_2.c 2019-11-28 19:56:35 +08:00
Martin Kroeker
a4c3668f99 Merge pull request #2321 from martin-frbg/issue2319
Fix race conditions in multithreaded GEMM3M
2019-11-28 09:30:24 +01:00
Martin Kroeker
867232c6a4 Merge pull request #2327 from martin-frbg/travisosx
Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now
2019-11-28 08:43:45 +01:00
Martin Kroeker
5aaf70ef95 Merge pull request #2326 from xianyi/revert-2325-travisosx
Revert "Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now"
2019-11-28 00:17:19 +01:00
Martin Kroeker
ae2a0995cc Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now
Travis recently appears unable to find a matching homebrew package for 32bit gfortran,
and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined"
warnings when CFLAGS is set in the environment
2019-11-28 00:15:36 +01:00
Martin Kroeker
83dae28ae2 Revert "Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now" 2019-11-28 00:09:06 +01:00
Martin Kroeker
da986d2e83 Merge pull request #2325 from martin-frbg/travisosx
Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now
2019-11-27 21:59:36 +01:00
Martin Kroeker
6bc487de35 Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now
Travis recently appears unable to find a matching homebrew package for 32bit gfortran,
and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined"
warnings when CFLAGS is set in the environment
2019-11-27 15:10:57 +01:00
Anton Blanchard
cf2a8e410c Fix SEGV in cdot_power9
We were corrupting r2 because the local entry wasn't being
setup correctly.
2019-11-26 21:55:04 -07:00
wjc404
eb1e9c8c92 some optimizations 2019-11-26 14:12:20 +08:00
Martin Kroeker
f95989cbc1 Fix AVX512 capability test (always returning zero)
from #2322
2019-11-23 22:38:07 +01:00
Martin Kroeker
f3065a0eed Fix race conditions in multithreaded GEMM3M
by adding barriers (and a mutex lock for the non-OpenMP case) like it was already done for GEMM in level3_thread.c some time ago
2019-11-23 19:54:56 +01:00
Martin Kroeker
04226f1e97 Add the cpuid of the business/rackmount version of z15 as well 2019-11-21 18:14:29 +01:00
Martin Kroeker
0925ef70db Merge pull request #2316 from sharkcz/s390x
zarch: treat z15 as z14 instead of generic
2019-11-21 18:03:00 +01:00
Martin Kroeker
371e6f73d4 Merge pull request #2317 from aarnez/develop
Change bad usage of "asum" to "sum" in ZARCH versions of ?sum
2019-11-21 17:59:21 +01:00
Andreas Arnez
d117dfd505 Change bad usage of "asum" to "sum" in ZARCH versions of ?sum
The ZARCH implementations of ?sum contain a cut & paste-error: An inline
assembly argument is named "sum", but the assembly references "asum"
instead.  The mismatch causes a build error.  This is fixed.
2019-11-21 13:49:13 +01:00
Dan Horák
883c39773a zarch: treat z15 as z14 instead of generic
Signed-off-by: Dan Horák <dan@danny.cz>
2019-11-21 12:53:23 +01:00
Martin Kroeker
b09b5be0a4 Merge pull request #2315 from ewanglong/develop
revised fix windows compatible for #2313
2019-11-21 05:06:44 +01:00
Wang, Long
bfb5fbdb4d revised fix windows compatible for #2313
Signed-off-by: Wang, Long <long1.wang@intel.com>
2019-11-21 10:22:58 +08:00
Martin Kroeker
3da6d66da9 Merge pull request #2314 from Jehan/wip/Jehan/fix-openblas-crash
Fix usage of TerminateThread() causing critical section corruption.
2019-11-20 16:16:35 +01:00
Martin Kroeker
08fa83aba2 Merge pull request #2312 from martin-frbg/power8be
Further Power8 big-endian corrections
2019-11-20 15:12:06 +01:00
Martin Kroeker
63d3ee8dfc Merge pull request #2313 from ewanglong/develop
Fix the integer overflow issue for large matrix size
2019-11-20 14:49:15 +01:00
Wang, Long
1191db1a49 For the sake of windows compatible, used "unsigned long long" to ensure 64-bit length
Signed-off-by: Wang, Long <long1.wang@intel.com>
2019-11-20 21:30:47 +08:00
Jehan
1f6071590d Fix usage of TerminateThread() causing critical section corruption.
This patch was submitted to the GIMP project by a publisher wishing to
keep confidentiality (hence anonymously). I just pass along the patch.
Here is the patch explanation which came with:

First they remind us what Microsoft documentation says about
TerminateThread:
> TerminateThread is a dangerous function that should only be used in
> the most extreme cases. You should call TerminateThread only if you
> know exactly what the target thread is doing, and you control all of
> the code that the target thread could possibly be running at the time
> of the termination.
(https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-terminatethread)

Then they say that 5 milliseconds time-out might not be long enough for
the thread to exit gracefully. They propose to set it to a much higher
value (for instance here 5 seconds).

And finally you should always check the return value of
WaitForSingleObject(). In particular you want to run TerminateThread()
only if WaitForSingleObject() failed, not on success case.
2019-11-20 13:00:49 +01:00
Wang, Long
0caf1434c9 Fix the integer overflow issue for large matrix size
For large matrix, e.g. M=N=K, and M>1290, int mnk=M*N*K will overflow.
This will lead to wrong branching to single-threading. The performance
is downgraded significantly.

Signed-off-by: Wang, Long <long1.wang@intel.com>
2019-11-20 14:11:17 +08:00
Martin Kroeker
73128f3883 Merge pull request #2310 from martin-frbg/ppc440
Fix PPC440 big-endian support and disable the QCDOC qalloc routine by default
2019-11-17 23:19:48 +01:00
Martin Kroeker
cad0d150db Define alternate kernels for big-endian POWER8 2019-11-17 23:12:10 +01:00
Martin Kroeker
eba0aeb7cd Fix compilation for big-endian POWER8 2019-11-17 22:58:32 +01:00
Martin Kroeker
0c07c356c1 Define alternate kernels for big-endian PPC440 2019-11-17 19:25:08 +01:00
Martin Kroeker
82b75f97e5 Disable the old QCDOC qalloc by default and copy utility functions from memory.c
1. qalloc() appears to have been a special routine written for the PPC440-based QCDOC supercomputer(s) from around 2005, its source does not seem to be readily available. So switch the #if 1 in the code to rely on standard malloc() by default.
2. Utility functions like get_num_procs, get_num_threads that were added to the "normally" used memory.c in the meantime were still missing here.
2019-11-17 19:22:04 +01:00
Martin Kroeker
7887c45077 Merge pull request #17 from xianyi/develop
rebase
2019-11-17 19:09:49 +01:00
Martin Kroeker
3e67017ac8 Merge pull request #2309 from martin-frbg/ppc970-be
Fix PPC970 big-endian support
2019-11-17 18:22:24 +01:00
Martin Kroeker
b3ac6ee222 Define alternate kernels for big-endian PPC970
The altivec versions of SGEMM and CGEMM fail most test in LAPACK-TESTING when compiled for big endian, STRSM/CTRSM even cause segfaults. The rot kernels either fail the corresponding utest or lead to failures in LAPACK-TESTING.
2019-11-17 15:19:39 +01:00
Martin Kroeker
6082e556cd Use "generic" S/CGEMM unroll M on big-endian PPC970
as the respective PPC970 "altivec" kernels give wrong results when compiled for big endian
2019-11-17 15:10:26 +01:00
Martin Kroeker
92315173d5 Merge pull request #2308 from martin-frbg/ctestfix
Fix potential issue in the c/z blas3 ctests
2019-11-15 08:33:17 +01:00
Martin Kroeker
351d12b94e Fix potential spurious failure from uninitialized variable 2019-11-15 00:20:36 +01:00
Martin Kroeker
bf73aa141b Fix potential spurious failure from uninitialized variable 2019-11-15 00:19:24 +01:00
Martin Kroeker
71e96163db Merge pull request #2305 from wjc404/develop
AVX512 CGEMM & ZGEMM kernels
2019-11-12 07:38:37 +01:00
wjc404
819e852ae7 AVX512 CGEMM & ZGEMM kernels
96-99% 1-thread performance of MKL2018
2019-11-11 20:04:52 +08:00
Martin Kroeker
4e466d739c Merge pull request #15 from xianyi/develop
rebase
2019-11-09 18:52:08 +01:00
Martin Kroeker
4c6a457358 Merge pull request #2300 from wjc404/develop
Optimize SGEMM on SKYLAKEX CPUs
2019-11-06 07:27:33 +01:00
wjc404
836c414e22 optimizations of software prefetching 2019-11-05 13:36:56 +08:00
Martin Kroeker
d403eb3c2f Merge pull request #2302 from martin-frbg/ppc970
Disable three-operand DCBT on PPC970 regardless of operating system
2019-11-04 22:55:05 +01:00
Martin Kroeker
3cd97f1a80 Merge pull request #2301 from martin-frbg/ppc8be
Disable IDAMIN/MAX and IZAMIN/MAX optimizations on big-endian POWER8
2019-11-04 22:54:28 +01:00
Martin Kroeker
9955f0996f Merge pull request #2294 from martin-frbg/ios-cleanup
Remove obsolete workarounds for IOS on ARMV8
2019-11-04 22:53:58 +01:00
wjc404
430c11e135 Add files via upload 2019-11-04 20:10:12 +08:00
wjc404
fbacd2605d optimizations via software prefetches 2019-11-04 19:37:19 +08:00
Martin Kroeker
6fa89b06a1 Use the two-operand form of DCBT on all PPC970 regardless of OS
There seems to be no advantage to the three-operand form used in the earliest GotoBLAS kernels, and it causes compilation problems  on other than the previously special-cased platforms as well
2019-11-03 22:55:31 +01:00
Martin Kroeker
68597002ea The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:42:46 +01:00
Martin Kroeker
d2a6285549 The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:41:19 +01:00
Martin Kroeker
d999688d1a The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:39:06 +01:00
Martin Kroeker
928fe1b28e The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:37:27 +01:00
Martin Kroeker
ccc28c6d60 Merge pull request #13 from xianyi/develop
resync with upstream
2019-11-03 22:33:31 +01:00
wjc404
ae43b75a6a Add files via upload 2019-11-02 10:09:19 +08:00
wjc404
54fc06fd70 Add files via upload 2019-11-02 10:06:13 +08:00
wjc404
1df9a2013d new sgemm kernel for skylakex 2019-11-02 00:00:48 +08:00
wjc404
274ff5cdb8 update sgemm_q on skylakex cpus 2019-11-01 23:59:18 +08:00
Martin Kroeker
eb2eddf241 Merge pull request #2296 from kdunee/develop
Fixed a minor cmake problem, occuring when DYNAMIC_ARCH=ON and CMAKE_C_FLAGS was empty
2019-10-28 13:24:18 +01:00
k.dunikowski
8691825944 Fixed a minor cmake problem, occuring when DYNAMIC_CORE=ON and CMAKE_C_FLAGS was empty 2019-10-28 08:51:05 +01:00
Martin Kroeker
7dc8a76f60 Merge pull request #2293 from martin-frbg/pr2288
Add support for NetBSD by adding it to the existing xBSD conditionals
2019-10-25 23:46:39 +02:00
Martin Kroeker
df857551c0 Remove special parameter set for obsolete IOS/ARMV8 workaround 2019-10-25 23:07:00 +02:00
Martin Kroeker
85ccdce8c4 Remove the IOS fallbacks to generic C kernels 2019-10-25 23:02:37 +02:00
Martin Kroeker
aeabe0a83f Fix regex to parse -R options with and without whitespace
Both forms are seen on NetBSD (#2288)
2019-10-25 22:52:30 +02:00
Martin Kroeker
1b90989662 Add NetBSD to the xBSD conditionals 2019-10-25 12:52:49 +02:00
Martin Kroeker
e3e8b5cdca Add NetBSD 2019-10-25 12:51:06 +02:00
Martin Kroeker
69b16a894d Merge pull request #2292 from martin-frbg/g95fixes
Improve support for g95 and non-GNU ld
2019-10-25 10:35:17 +02:00
Martin Kroeker
6782e5767d Merge pull request #2291 from martin-frbg/gensymbol
Fix netlib 3.7/3.8 function enumeration for linktest
2019-10-25 10:34:50 +02:00
Martin Kroeker
48f5a89f92 Merge pull request #2282 from martin-frbg/issue2281
Optimize RPCC function on ARM64
2019-10-25 09:56:30 +02:00
Martin Kroeker
4ae1610f37 Merge pull request #2290 from martin-frbg/cpuidfixes
Fixup x86 cpuid changes from #2283
2019-10-24 22:52:15 +02:00
Martin Kroeker
911c3e2f4b Improve support for g95 and non-GNU ld
Auto-add "-fno-second-underscore" option to make LAPACKE compile (as it calls LAPACK functions that may have gotten a second underscore added otherwise). Also support -R for rpath when parsing compiler directives in f_check
2019-10-24 22:43:27 +02:00
Martin Kroeker
fab49e49e5 Move most lapack 3.7/3.8 additions to the embedded_underscores list
to allow linktest to pass with a compiler that adds a second underscore to such names
2019-10-24 21:26:20 +02:00
Martin Kroeker
b687fba5bc Disable direct clock register access on IOS and Android
as I find conflicting information on accessibility from non-priviledged processes
2019-10-24 21:18:17 +02:00
luzpaz
46a8c2519a Remove prototype of unused, unimplemented function (#2274)
* Fix source typo

Found via `codespell -q 3 -L amin,als,ba,dum,mone,nd,nto,orign -S Changelog.txt,./lapack*`

* Remove beta-thread function per request
2019-10-24 18:56:53 +02:00
Martin Kroeker
e9437eebd2 Restore Goldmont ID and improve QEMU support
#2283 had inadvertently removed Goldmont+, and cpuid was reporting a mix of Core2 and Pentium2 for some QEMU configurations
2019-10-24 18:45:27 +02:00
Martin Kroeker
3a39062cfc Merge pull request #12 from xianyi/develop
resync with upstream
2019-10-24 18:40:13 +02:00
Martin Kroeker
eaa0be1313 Merge pull request #2286 from wjc404/develop
AVX512 DGEMM kernel
2019-10-20 12:44:19 +02:00
wjc404
6ff013bae0 native support for icopy_4
90% MKL 1-thread performance.
2019-10-19 03:54:44 +08:00
wjc404
0d669e04bb Update dgemm_kernel_8x8_skylakex.c 2019-10-18 15:00:17 +08:00
wjc404
17cdd9f9e1 some correction 2019-10-18 14:58:07 +08:00
wjc404
6bcb06fcb1 make further changes to icopy_8 easier 2019-10-18 10:47:31 +08:00
wjc404
b7315f8401 Add files via upload 2019-10-16 19:23:36 +08:00
wjc404
9b19e9e1b0 Update dgemm_kernel_8x8_skylakex.c 2019-10-16 10:14:51 +08:00
wjc404
6bd67ddbab Update dgemm_kernel_8x8_skylakex.c 2019-10-16 03:20:08 +08:00
wjc404
5da9484d93 Add files via upload 2019-10-16 02:01:13 +08:00
wjc404
844629af57 Add files via upload 2019-10-16 02:00:34 +08:00
Martin Kroeker
2beaa82c05 Merge pull request #2283 from martin-frbg/issue2176
Support QEMU virtual cpu in 64bit mode as CORE2 or BARCELONA
2019-10-09 22:06:09 +02:00
Martin Kroeker
e8a2aed2b9 Support QEMU cpu calling itself 64bit AMD Athlon as well
Some QEMU instances pretend to be "AuthenticAMD" with the same family 6/model 6 even when running on an Intel host
(could be related to qemu or libvirt version and/or kvm availability). Also fix the define to depend on __x86_64__ set by the
compiler, the defines using __64BIT__ will only work for getarch_2nd.
2019-10-09 18:24:13 +02:00
Martin Kroeker
f262031685 Support QEMU virtual cpu as CORE2
qemu itself claims it is a 64bit P6, which does not exist in the wild.
2019-10-08 22:30:02 +02:00
Martin Kroeker
5f6206fa2d Simplify OSX/IOS cross-compilation and add a CI test for it (#2279)
* Add automatic fixups for OSX/IOS cross-compilation

* Add OSX/IOS cross-compilation test to Travis CI

* Handle platforms that lack hwcap.h by falling back to ARMV8

* Fix PROLOGUE for OSX/IOS
2019-10-08 20:13:14 +02:00
Martin Kroeker
f2cde2ccfb Update common_arm64.h 2019-10-08 20:12:08 +02:00
Martin Kroeker
ba7838d2e1 Merge pull request #2280 from martin-frbg/iosfix
Add overlooked part of IOS compilation fix
2019-10-08 10:25:25 +02:00
Martin Kroeker
a448884a63 Remove automatic label postfixes from macro included only once 2019-10-08 08:37:50 +02:00
Martin Kroeker
17609f88f1 Merge pull request #11 from xianyi/develop
sync with upstream
2019-10-08 08:32:52 +02:00
Martin Kroeker
3a2df19db6 Fix accidental duplication of jump instruction 2019-10-08 08:09:26 +02:00
Martin Kroeker
d2093a40d3 Merge pull request #2277 from martin-frbg/issue2275
Rewrite ARMV8 code to allow cross-compilation for IOS
2019-10-06 23:01:54 +02:00
Martin Kroeker
aa04b0925e Merge pull request #2276 from xianyi/revert-2272-thread-sqrt-of-negative
Revert "Avoid taking root of negative number in symv_thread.c"
2019-10-06 11:12:44 +02:00
Martin Kroeker
258ac56e0a Move 32bit OSX build back to xcode 8.3 but switch to gcc8 2019-10-05 10:52:47 +02:00
Martin Kroeker
56837e9d92 Make local labels in macro compatible with the xcode assembler
... which does not perform the automatic numbering on instantiation that the _@ suffix signifies
2019-10-04 14:53:23 +02:00
Martin Kroeker
bb5413863f Rewrite ARM64 PROLOGUE to make it compatible with xcode/ios 2019-10-04 14:50:03 +02:00
Martin Kroeker
32f5907fef Update 32bit macOS again to xcode 9.3
os version 10.13 "High Sierra" appears to be the oldest release now for which Homebrew provides a gcc package.
Anything older and the Travis job will run out of time building gcc from source
2019-10-03 01:09:02 +02:00
Martin Kroeker
ac10236cc8 Update the OSX BINARY=32 test to xcode9.2
in response to Homebrew updates
2019-10-02 22:35:34 +02:00
Martin Kroeker
8617d75548 Revert "Avoid taking root of negative number in symv_thread.c" 2019-10-01 23:50:41 +02:00
Martin Kroeker
c07d78b9e9 Merge pull request #2272 from seberg/thread-sqrt-of-negative
Avoid taking root of negative number in symv_thread.c
2019-09-30 11:27:29 +02:00
Sebastian Berg
6355c25dde Avoid taking root of negative number in symv_thread.c
This is similar to fixes in gh-1929, but there was one remaining
occurance of this type of pattern in the driver/level2/*_thread.c
files.
2019-09-29 22:03:12 -07:00
Martin Kroeker
5e244d80f2 Merge pull request #2271 from quickwritereader/strmm_fix
fixed bug power9 strmm . BLAS-TESTER passes
2019-09-29 13:53:45 +02:00
AbdelRauf
ede5efebab trmm fix 2019-09-29 02:28:34 +00:00
Martin Kroeker
84908d60d2 Merge pull request #2269 from martin-frbg/ppc-fixes
Ppc fixes
2019-09-27 09:52:19 +02:00
Martin Kroeker
596a22325a Fix prologue of power9 assembly cdot(c) kernel to provide cdotc 2019-09-27 00:47:18 +02:00
Martin Kroeker
7f58f3ad0e Fix mis-edits in the gcc-derived power8 caxpy kernel 2019-09-27 00:44:26 +02:00
Martin Kroeker
c0d570a357 Merge pull request #7 from xianyi/develop
update
2019-09-27 00:42:32 +02:00
Martin Kroeker
6b83079368 Count cpu cores on ARMV8 and use that to pick the GEMM_PQ parameters (#2267)
There is currently no simple way to query cache sizes on ARMV8, so this takes the number of cores as a trivial indication if the target is a server-class device with a big cache, or just a single-board toy or smartphone.
2019-09-25 23:13:24 +02:00
Martin Kroeker
673e5a0495 Replace several POWER8/9 C kernels with their gcc7-generated assembly versions (#2263)
* Add gcc7-generated assembly files for POWER8/9 isa/ica-min/max and POWER9 caxpy

To work around internal compiler errors encountered when compiling the original C source with gcc 4 and 5, and wrong code generated by gcc 8.3.0

* Use gcc-generated assembly instead of original C sources

to work around internal compiler errors encountered with gcc 4.8/5.4 and wrong code generation by gcc 8.3

* Use gcc-generated assembly instead of the original C source

to work around internal compiler errors encountered with gcc 4.8 and 5.4, and wrong code generation by gcc 8.3

* Add gcc7-generated assembler version of caxpy for power8

to work around wrong code generated by gcc 8.3

* Handle CONJ define for caxpyc

* Handle CONJ define for caxpyc

* Add gcc7-generated assembly cdot for POWER9

* Use prebuilt assembly for POWER9 cdot

created with gcc 7.3.1 to work around ICE in older gcc versions

* Exclude POWER9 from DYNAMIC_ARCH when gcc versions is lower than 6

* Update Makefile.system

* Use PROLOGUE macro to ensure correct function name for DYNAMIC_ARCH

* Disable POWER9 with old gcc versions
2019-09-22 22:35:22 +02:00
Martin Kroeker
bfa2cc7d64 Restore ppc64 CI job and remove the travis_wait that caused the problem with it 2019-09-20 10:29:35 +02:00
Martin Kroeker
e7c4d6705a Revert #2051 and replace with a better fix (#2261)
* Revert #2051 and add a better fix for TARGET=generic with DYNAMIC_ARCH
fixes #2257 without breaking #2048 again
2019-09-17 18:56:04 +02:00
Martin Kroeker
2a1911cc14 Merge pull request #6 from xianyi/develop
update to current develop
2019-09-13 14:00:23 +02:00
Martin Kroeker
9f7a9a32e3 Merge pull request #2252 from thrasibule/trtrs
Optimized ?trtrs
2019-09-12 21:45:47 +02:00
Guillaume Horel
2463938879 fix error message 2019-09-11 10:35:25 -04:00
Guillaume Horel
5d6525c87c more bugfix 2019-09-10 17:30:57 -04:00
Guillaume Horel
6cb47ea3f0 fix Makefile 2019-09-10 17:11:01 -04:00
Guillaume Horel
459bb9291d fix error codes 2019-09-10 17:10:33 -04:00
Martin Kroeker
3f1077ce6f Merge pull request #2249 from brada4/gcc7minor
Address minor warnings popping up in gcc7+
2019-09-10 08:27:32 +02:00
Martin Kroeker
eb45eb6942 Fix C compiler handling and BINARY=32 mode in CMAKE builds (#2248)
* Fix compiler identification and option setting

* Handle BINARY=32 option on X86_64

* Add xGEMM3M unroll parameters for crossbuild-target CORE2

* Replace bogus mingw64/32bit CI job with actual 32bit build

mingw64 is not multilib-capable, so using an x86_64-mingw with BINARY=32 in the CI was not going to work anyway (but build passed while BINARY=32 was ignored).
2019-09-10 08:27:06 +02:00
Guillaume Horel
f2becb777a fix Makefile 2019-09-09 11:36:50 -04:00
Guillaume Horel
5997b6b491 bugfix 2019-09-08 11:14:49 -04:00
Guillaume Horel
4b21b646ea turn on optimized code 2019-09-08 11:14:49 -04:00
Guillaume Horel
7ec7b999a5 add missing file 2019-09-08 11:14:49 -04:00
Guillaume Horel
af9ac0898a fix Makefile 2019-09-08 11:14:49 -04:00
Guillaume Horel
c7b5a459b6 add missing defines and headers 2019-09-08 11:14:49 -04:00
Guillaume Horel
9b2f0323d6 update Makefile 2019-09-08 11:14:49 -04:00
Guillaume Horel
9f6984fe4b add missing files 2019-09-08 11:14:49 -04:00
Guillaume Horel
42203dafdc add logic 2019-09-08 11:14:49 -04:00
Guillaume Horel
a4f17a9297 add missing objects 2019-09-08 11:14:49 -04:00
Guillaume Horel
733d97b2df add files 2019-09-08 11:14:49 -04:00
Guillaume Horel
ea747cf933 start working on ?trtrs 2019-09-08 11:14:49 -04:00
Andrew
4de545aa7d address minor warnings from gcc7 2019-09-07 10:21:08 +03:00
Andrew
6e9a93ec19 init 2019-09-07 10:18:46 +03:00
Martin Kroeker
fde8a8e6a0 Improve cmake build behaviour with non-host cpu targets (#2246)
1. Supply appropriate values for C/Z GEMM unroll when cross-compiling for CORE2 or ARMV7
2. Add the required xLOCAL_BUFFER_SIZE parameters for cross-compiling CORE2
3. Add -DFORCE_<target> option to getarch when building with -DTARGET=target
for #2245
2019-09-03 22:41:17 +02:00
Martin Kroeker
256fc15f5f Merge pull request #2 from xianyi/develop
update
2019-09-03 15:12:14 +02:00
Martin Kroeker
ee498525e0 Merge pull request #2242 from martin-frbg/issue2235
Add arch data for cmake cross-compiling to CORE2
2019-09-02 22:06:29 +02:00
Martin Kroeker
1fec0570f6 Add cgemm and zgemm unroll factors for core2 2019-09-02 15:03:45 +02:00
Martin Kroeker
b5af7b9c78 Disable ppc64le test environment on Travis CI
as this semi-official beta option has suddenly reverted to a standard x86_64 environment causing spurious failures
2019-08-31 18:06:12 +02:00
Martin Kroeker
f3c314550c Merge pull request #2243 from quickwritereader/develop
possible cgemv,caxpy,cdot fix
2019-08-30 23:06:23 +02:00
AbdelRauf
847c20c9b7 fix uninitialized variables i 2019-08-30 11:14:55 +00:00
AbdelRauf
4c22828812 caxpy and cdot are using vec_vsx_ld 2019-08-30 04:09:15 +00:00
AbdelRauf
e79712d969 cgemv using vec_vsx_ld instead of letting gcc to decide 2019-08-30 02:52:04 +00:00
AbdelRauf
be09551cdf aligned 2019-08-29 23:22:23 +00:00
Martin Kroeker
ec1ef6aa9e Merge pull request #2241 from martin-frbg/zdotfix
Make x86_64 zdot compile with PGI and Sun C again
2019-08-29 07:12:54 +02:00
Martin Kroeker
11c59acfb1 Keep both PGI/SUN and default code paths to avoid breaking Clang/WIndows 2019-08-28 18:07:44 +02:00
Martin Kroeker
bf0d92a310 Add arch data for cross-compiling to CORE2
for #2235
2019-08-28 17:35:56 +02:00
Martin Kroeker
db066151ee Merge pull request #2240 from martin-frbg/issue2237
Fix PGI build options (again)
2019-08-28 15:30:53 +02:00
Martin Kroeker
3a55dca2dc Make x86_64 zdot compile with PGI and Sun C again
broken by #2222 as CREAL,CIMAG do not expand to a valid lvalue with these compilers
2019-08-28 11:35:31 +02:00
Martin Kroeker
7d380f7d79 Fix PGI build options (again)
for #2237
2019-08-28 11:31:20 +02:00
Martin Kroeker
300f158d3b Merge pull request #2239 from martin-frbg/issue2231
Fix 32bit armv8 compilation regression
2019-08-28 07:54:57 +02:00
Martin Kroeker
3635fdbf2b Do not abuse the global ARCH variable as a local temporary
Setting it with a simple "uname -m" just to be able to decide whether to compile getarch.c with -march=native
may actually keep getarch from doing a proper probe. Fixes #2231, a regression caused by #2110
2019-08-27 22:52:17 +02:00
Martin Kroeker
b6552b11eb Merge pull request #2 from xianyi/develop
merge develop
2019-08-27 22:41:31 +02:00
Kavana Bhat
3dc6b26eff AIX changes for Power8 2019-08-20 06:51:35 -05:00
Martin Kroeker
5fdf9ad24f Merge pull request #2228 from martin-frbg/issue2227
Add Intel Goldmont Plus CPUID
2019-08-19 18:26:51 +02:00
Martin Kroeker
2fe967c542 Merge branch 'develop' into issue2227 2019-08-19 14:20:39 +02:00
Martin Kroeker
6d8595351c Add Intel Goldmont Plus CPUID
fixes #2227
2019-08-19 14:19:21 +02:00
Martin Kroeker
f40200f559 Merge pull request #2223 from martin-frbg/getarch-pgi
Make getarch compile with PGI
2019-08-16 12:21:30 +02:00
Martin Kroeker
a95a5e52b8 Fix PGI compiler detection for getarch 2019-08-16 09:00:11 +02:00
Martin Kroeker
e3d846ab57 Do not use -march=native with the PGI compiler 2019-08-16 08:58:10 +02:00
Martin Kroeker
8506386d82 Merge pull request #1 from xianyi/develop
rebase
2019-08-16 08:56:15 +02:00
Martin Kroeker
9ef96b32a6 Add multithreading support to the x86_64 zdot kernel (#2222)
* Add multithreading support

copied from the ThunderX2T99 kernel. For #2221
2019-08-15 22:09:12 +02:00
Martin Kroeker
b48c025974 Merge pull request #2218 from martin-frbg/issue2215
Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
2019-08-14 07:32:31 +02:00
Martin Kroeker
a1fce67743 Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
fixes #2215
2019-08-13 22:29:48 +02:00
Martin Kroeker
103b32fdb7 Merge pull request #2216 from martin-frbg/issue2214
Remove case-sensitivity in x86 LSAME on (AMD) cpus without CMOV
2019-08-13 13:59:33 +02:00
Martin Kroeker
aef9804089 Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV
Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches.
Fixes #2214
2019-08-13 10:19:10 +02:00
Martin Kroeker
303869f572 Update with changes from 0.3.7 2019-08-11 23:31:36 +02:00
Martin Kroeker
02d9203981 Increment version to 0.3.8.dev 2019-08-11 23:28:47 +02:00
Martin Kroeker
7b6808b69c Increment version to 0.3.8.dev 2019-08-11 23:28:13 +02:00
Martin Kroeker
5f36f18148 Update with 0.3.7 changes 2019-08-11 23:23:27 +02:00
Martin Kroeker
d47fe78b0e Set version to 0.3.7 2019-08-11 23:16:45 +02:00
Martin Kroeker
ebe2f47a0f Set version to 0.3.7 2019-08-11 23:16:11 +02:00
Martin Kroeker
20d417762f Merge pull request #2213 from xianyi/develop
Update from develop in preparation of the 0.3.7 release
2019-08-11 23:14:49 +02:00
Martin Kroeker
321288597c Merge pull request #2212 from martin-frbg/nofort-nolib
Avoid spurious dependency on the fortran runtime despite NOFORTRAN=1
2019-08-11 20:26:34 +02:00
Martin Kroeker
be147a9f28 Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1
for cases where a fortran compiler is present but not wanted (e.g. not fully functional)
2019-08-11 16:24:39 +02:00
Martin Kroeker
c275290ea6 Merge pull request #2211 from martin-frbg/arm64_gcc_trivial
Silence two nuisance warnings from gcc
2019-08-11 16:08:05 +02:00
Martin Kroeker
b7bbb02447 Silence two nuisance warnings from gcc 2019-08-11 12:46:05 +02:00
Martin Kroeker
bf1430f7d7 Merge pull request #2208 from martin-frbg/munmap-debug
Provide more information on mmap/munmap failure
2019-08-09 07:55:35 +02:00
Martin Kroeker
dccff2e785 Merge pull request #2206 from martin-frbg/zen-dtrmm
Replace vpermpd with vpermilpd in the Haswell DTRMM kernel
2019-08-09 07:55:20 +02:00
Martin Kroeker
5c3458a6e7 Merge pull request #2199 from martin-frbg/zen-dtrsm
Replace most vpermpd calls in the Haswell DTRSM_RN kernel
2019-08-09 07:55:02 +02:00
Martin Kroeker
1776ad82c0 Add files via upload 2019-08-09 00:08:11 +02:00
Martin Kroeker
4e2f81cfa1 Provide more information on mmap/munmap failure
for #2207
2019-08-08 23:15:35 +02:00
Martin Kroeker
acf6002ab2 Replace most vpermpd calls in the Haswell DTRSM_RN kernel 2019-08-03 12:40:13 +02:00
Martin Kroeker
96a794e9fd Merge pull request #2198 from martin-frbg/icelake
Update CPUID recognition for Intel Ice Lake
2019-08-02 08:36:14 +02:00
Martin Kroeker
3d36c45116 Add CPUID identification of Intel Ice Lake 2019-08-01 22:52:35 +02:00
Martin Kroeker
648491e1aa Autodetect Intel Ice Lake (as SKYLAKEX target) 2019-08-01 22:51:09 +02:00
Martin Kroeker
2dfb804cb9 Replace vpermpd with vpermilpd in the Haswell DTRMM kernel
to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186
2019-07-28 23:17:28 +02:00
Martin Kroeker
4c153ec9da Merge pull request #2196 from wjc404/develop
Add vbroadcastsd kernel to dgemm_kernel_4x8_haswell.S
2019-07-28 23:11:40 +02:00
wjc404
7eecd8e39c Add files via upload 2019-07-28 07:39:09 +08:00
Martin Kroeker
f0406a7708 Merge pull request #2112 from ffontaine/develop
Makefile.arm: remove -march flags
2019-07-27 13:00:13 +02:00
Martin Kroeker
561f3fd995 Merge pull request #2193 from martin-frbg/makeutest
Override special make variables
2019-07-24 20:19:21 +02:00
Martin Kroeker
30efed14d1 Unset special make variables in ctest Makefile as well 2019-07-24 15:26:09 +02:00
Martin Kroeker
af2e7f28fc Override special make variables
as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. 
(Other subtargets appear to be unaffected as they do not use implicit make rules)
2019-07-23 16:56:40 +02:00
Martin Kroeker
4250e6ed64 Merge pull request #2191 from tylerjereddy/conditional_updates
MAINT: remove legacy CMake endif()
2019-07-23 16:20:39 +02:00
Martin Kroeker
7b0b7c11d2 Merge pull request #2190 from martin-frbg/zdot-zen
Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel
2019-07-23 16:15:08 +02:00
Martin Kroeker
d14cf1ccf4 Merge pull request #2189 from wjc404/develop
Update dgemm_kernel_4x8_haswell.S for reducing cache misses
2019-07-23 08:32:56 +02:00
Tyler Reddy
3f6ab1582a MAINT: remove legacy CMake endif()
* clean up a case where CMake endif()
contained the conditional used in the
if(), which is no longer needed /
discouraged since our minimum required
CMake version supports the modern syntax
2019-07-22 21:24:57 -06:00
Martin Kroeker
28e96458e5 Replace vpermpd with vpermilpd
to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180)
2019-07-22 08:28:16 +02:00
wjc404
95fb98f556 Update dgemm_kernel_4x8_haswell.S 2019-07-21 01:10:32 +08:00
wjc404
4801c6d36b Update dgemm_kernel_4x8_haswell.S 2019-07-21 00:47:45 +08:00
wjc404
9440fa607d Add files via upload 2019-07-20 22:08:22 +08:00
wjc404
94db259e5b Add files via upload 2019-07-20 22:04:41 +08:00
wjc404
f49f8047ac Add files via upload 2019-07-20 14:33:37 +08:00
wjc404
825777faab Update dgemm_kernel_4x8_haswell.S 2019-07-19 23:58:24 +08:00
wjc404
9c89757562 Add files via upload 2019-07-19 23:47:58 +08:00
Martin Kroeker
b0b7600bef Merge pull request #2186 from wjc404/develop
Update "dgemm_kernel_4x8_haswell.S" for improving performance on zen2 chips
2019-07-18 16:04:44 +02:00
wjc404
9b04baeaee Update dgemm_kernel_4x8_haswell.S 2019-07-17 23:50:03 +08:00
wjc404
8a074b3965 Update dgemm_kernel_4x8_haswell.S 2019-07-17 23:47:30 +08:00
wjc404
211ab03b14 Update dgemm_kernel_4x8_haswell.S 2019-07-17 22:39:15 +08:00
wjc404
1733f927e6 Update dgemm_kernel_4x8_haswell.S 2019-07-17 21:27:41 +08:00
wjc404
182b06d6ad Update dgemm_kernel_4x8_haswell.S 2019-07-17 17:02:35 +08:00
wjc404
7a9050d681 Update dgemm_kernel_4x8_haswell.S 2019-07-17 00:55:06 +08:00
wjc404
0ba29fd262 Update dgemm_kernel_4x8_haswell.S for zen2
replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128
2019-07-17 00:46:51 +08:00
Martin Kroeker
bafa021ed6 Merge pull request #2181 from isuruf/install_name
Change install_name on osx to match linux
2019-07-09 20:08:52 +02:00
Isuru Fernando
b89d9762a2 Change install_name on osx to match linux 2019-07-08 17:14:35 -05:00
Martin Kroeker
08dedf4c5e Merge pull request #2177 from martin-frbg/noaff
Fix surprising behaviour of NO_AFFINITY=0
2019-07-07 18:28:21 +02:00
Martin Kroeker
b89c781637 Fix surprising behaviour of NO_AFFINITY=0 2019-07-07 16:04:45 +02:00
Martin Kroeker
dd7ff77f4b Merge pull request #2175 from martin-frbg/cmake-mingw-fixes
Fix CMAKE compilation with MinGW32 and add it to Appveyor
2019-07-06 18:07:19 +02:00
Martin Kroeker
8fb76134bc Mingw32 needs leading underscore on object names
(also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile)
2019-07-06 15:07:15 +02:00
Martin Kroeker
04d671aae2 Make disabling DYNAMIC_ARCH on unsupported systems work
needs to be unset in the cache for the change to have any effect
2019-07-06 15:05:04 +02:00
Martin Kroeker
f69a0be712 Add getarch flags to disable AVX on x86
(and other small fixes to match Makefile behaviour)
2019-07-06 15:02:39 +02:00
Martin Kroeker
ae9e8b131e Add mingw builds to Appveyor config 2019-07-06 14:30:33 +02:00
Martin Kroeker
9086543f50 Utest needs CBLAS but not necessarily FORTRAN 2019-07-06 14:29:47 +02:00
Martin Kroeker
abea977ded Merge pull request #2162 from martin-frbg/pgi
Fixes for PGI compiler
2019-07-03 19:16:30 +02:00
Martin Kroeker
6b6c9b1441 Merge pull request #2172 from quickwritereader/develop
power9 cgemm/ctrmm. new sgemm 8x16
2019-07-01 21:06:02 +02:00
AbdelRauf
a97b301aaa cgemm/ctrmm power9 2019-07-01 14:07:54 +00:00
Martin Kroeker
2f13f04224 Merge pull request #2170 from pkubaj/patch-1
Fix build on PPC970 for FreeBSD
2019-06-30 23:29:02 +02:00
pkubaj
7c7505a778 Fix build for PPC970 on FreeBSD pt.2
FreeBSD needs those macros too.
2019-06-28 10:31:45 +00:00
pkubaj
5a4f1a2118 Fix build for PPC970 on FreeBSD pt. 1
FreeBSD needs DCBT_ARG=0 as well.
2019-06-28 10:29:44 +00:00
Martin Kroeker
3b761892df Merge pull request #2169 from pkubaj/develop
Fix build on FreeBSD/powerpc64.
2019-06-25 12:56:33 +02:00
Piotr Kubaj
eebfeba768 Fix build on FreeBSD/powerpc64.
Signed-off-by: Piotr Kubaj <pkubaj@anongoth.pl>
2019-06-25 10:58:56 +02:00
Martin Kroeker
7684c4f8f8 PGI compiler does not like -march=native 2019-06-20 19:56:01 +02:00
Martin Kroeker
7faf42b7bb Merge pull request #2167 from kavanabhat/dtrmm_power8_segfault
Fix DTRMMKERNEL register save for power8 64-bit mode (Fix for #2166)
2019-06-19 14:38:01 +02:00
kavanabhat
a575f1e4c7 Update dtrmm_kernel_16x4_power8.S 2019-06-19 15:27:14 +05:30
AbdelRauf
cdbfb891da new sgemm 8x16 2019-06-17 15:33:38 +00:00
Martin Kroeker
280552b988 Fix mov syntax 2019-06-16 18:35:43 +02:00
Martin Kroeker
bbd4bb0154 Zero ecx with a mov instruction
PGI assembler does not like the initialization in the constraints.
2019-06-16 15:04:10 +02:00
Martin Kroeker
6d3efb2b58 Update Makefile.x86_64 2019-06-14 08:08:11 +02:00
Martin Kroeker
d9ff2cd90d Do not force gcc options on non-gcc compilers
fixes compile failure with pgi 18.10 as reported on OpenBLAS-users
2019-06-13 23:01:35 +02:00
Martin Kroeker
2a43062de7 Merge pull request #2159 from martin-frbg/issue2149
Avoid unintentional activation of TLS codepath via USE_TLS=0
2019-06-10 19:12:45 +02:00
Martin Kroeker
4ea794a522 Avoid unintentional activation of TLS code via USE_TLS=0
fixes #2149
2019-06-10 17:24:15 +02:00
Martin Kroeker
ece0bfb881 Merge pull request #2158 from martin-frbg/issue2143
Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds
2019-06-10 14:08:11 +02:00
Martin Kroeker
1f4b6a5d5d Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds
from #2143, -march=native precludes use of more specific options like -march=skylake-avx512 in individual kernels, and defeats the purpose of dynamic arch anyway.
2019-06-10 09:50:13 +02:00
Martin Kroeker
be8f70d269 Merge pull request #2157 from martin-frbg/2154-2
Add gfortran workaround for potential ABI violation
2019-06-09 12:19:08 +02:00
Martin Kroeker
e674e1c735 Update fc.cmake 2019-06-09 09:31:13 +02:00
Martin Kroeker
6ca898b63b Add gfortran workaround for potential ABI violation
for #2154
2019-06-08 23:17:03 +02:00
Martin Kroeker
26411acd56 Merge pull request #2148 from TiborGY/cpp_thread_test_2
Thread safety tester using C++11 threading (cleaned history)
2019-06-07 13:23:07 +02:00
Martin Kroeker
0ab4076dd8 Merge pull request #2156 from martin-frbg/issue2154
Add gfortran workaround for C->FORTRAN ABI violation
2019-06-06 13:43:12 +02:00
Martin Kroeker
a0caa762b3 Add gfortran workaround for ABI violations
for #2154 (see gcc bug 90329)
2019-06-06 10:24:16 +02:00
Martin Kroeker
900d5a3205 Add gfortran workaround for ABI violations in LAPACKE
for #2154 (see gcc bug 90329)
2019-06-06 10:18:40 +02:00
Martin Kroeker
a17cf36225 Merge pull request #2153 from quickwritereader/develop
improved power9 zgemm,sgemm
2019-06-06 07:42:56 +02:00
AbdelRauf
148c4cc5fd conflict resolve 2019-06-05 20:50:50 +00:00
AbdelRauf
d0c3543c3f power9 zgemm ztrmm optimized 2019-06-05 20:07:16 +00:00
Martin Kroeker
909ad04aef Merge pull request #2145 from martin-frbg/1912-3
Separate implementations of AMAX and IAMAX on arm
2019-06-05 20:27:45 +02:00
Martin Kroeker
417efd41c6 Merge pull request #2110 from pc2/cpu-detection
Fix detection of Skylake processors when using GCC
2019-06-05 20:27:05 +02:00
Michael Lass
9cdc828afa c_check: Unlink correct file 2019-06-05 17:31:01 +02:00
Michael Lass
7a9a4dbc4f Fix detection of AVX512 capable compilers in getarch
21eda8b5 introduced a check in getarch.c to test if the compiler is capable of
AVX512. This check currently fails, since the used __AVX2__ macro is only
defined if getarch itself was compiled with AVX2/AVX512 support. Make sure this
is the case by building getarch with -march=native on x86_64. It is only
supposed to run on the build host anyway.
2019-06-05 17:30:56 +02:00
AbdelRauf
a469b32cf4 sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52 2019-06-04 07:11:30 +00:00
Martin Kroeker
27649b9543 Document NO_AVX512
for #2151
2019-06-03 11:01:33 +02:00
TiborGY
16f3df5d35 add c++ thread test option to Makefile.rule 2019-06-01 21:36:41 +02:00
TiborGY
1aded69821 hook up c++ thread safety test (main Makefile) 2019-06-01 21:32:52 +02:00
TiborGY
c00289ba54 upload thread safety test folder 2019-06-01 21:30:06 +02:00
AbdelRauf
8fe794f059 improved zgemm power9 based on power8 2019-05-30 15:31:25 +00:00
Martin Kroeker
74c10b57c6 Use generic kernels for complex (I)AMAX to support softfp 2019-05-30 11:38:11 +02:00
Martin Kroeker
c5495d2056 Ensure correct output for DAMAX with softfp 2019-05-30 11:25:43 +02:00
Martin Kroeker
c70496b108 Separate implementations of AMAX and IAMAX on arm
As noted in #1912 and comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but cannot return both value and index on softfp where they would have to share the return register
2019-05-29 15:02:51 +02:00
Martin Kroeker
ca8d8835f5 Merge pull request #2144 from xianyi/revert-2142-issue1912-2
Revert "Add softfp support in min/max kernels"
2019-05-29 14:09:10 +02:00
Martin Kroeker
d76b20b4d2 Revert "Add softfp support in min/max kernels" 2019-05-29 14:07:17 +02:00
Martin Kroeker
85af04da3c Merge pull request #2142 from martin-frbg/issue1912-2
Add softfp support in min/max kernels
2019-05-28 22:56:08 +02:00
Martin Kroeker
11e0dcbffb Merge pull request #2141 from martin-frbg/issue1912
Build and run utests independently of fortran
2019-05-28 20:50:40 +02:00
Martin Kroeker
79366ff7a9 Add softfp support in min/max kernels
fix for #1912
2019-05-28 20:34:22 +02:00
Martin Kroeker
21d05a4835 Merge pull request #2140 from martin-frbg/pgi19
Do not try ancient PGI hacks with recent versions of that compiler
2019-05-26 12:39:20 +02:00
Martin Kroeker
940f38f6dd Build and run utests in any case, they do their own checks for fortran availability 2019-05-24 13:02:23 +02:00
Martin Kroeker
1778fd4219 Do not try ancient PGI hacks with recent versions of that compiler
should fix #2139
2019-05-22 13:48:27 +02:00
Martin Kroeker
969dd6175e Merge pull request #2136 from martin-frbg/issue2126
Add option to allow combining USE_THREAD=0 with thread locking support
2019-05-16 12:08:16 +02:00
Martin Kroeker
d8d5682481 Merge pull request #2134 from tylerjereddy/skylake_regress_guard_may14
TST: add SkylakeX AVX512 CI test
2019-05-15 23:40:06 +02:00
Martin Kroeker
f66c11fc22 Remove unrelated change 2019-05-15 23:38:12 +02:00
Martin Kroeker
5ecffc28f2 Add option USE_LOCKING but keep default settings intact 2019-05-15 23:36:17 +02:00
Martin Kroeker
86dda5c2fa Add option USE_LOCKING for SMP-like locking in USE_THREAD=0 builds 2019-05-15 23:21:20 +02:00
Martin Kroeker
1e52572be3 Add option USE_LOCKING for single-threaded build with locking support 2019-05-15 23:19:30 +02:00
Martin Kroeker
d2cb610272 Add option USE_LOCKING for single-threaded build with locking support
for calling from concurrent threads
2019-05-15 23:18:43 +02:00
Tyler Reddy
a211bc9b6a TST: add SkylakeX AVX512 CI test
* adapt the C-level reproducer code for some
recent SkylakeX AVX512 kernel issues, provided
by Isuru Fernando and modified by Martin Kroeker,
for usage in the utest suite

* add an Intel SDE SkylakeX emulation utest run to
the Azure CI matrix; a custom Docker build was required
because Ubuntu image provided by Azure does not support
AVX512VL instructions
2019-05-14 11:32:23 -07:00
Martin Kroeker
9208ab8603 Merge pull request #2130 from isuruf/drone
Drone CI for arm64 native builds
2019-05-14 09:37:00 +02:00
Isuru Fernando
b43deb4ad6 Fix typo 2019-05-12 15:26:18 -05:00
Isuru Fernando
b911525c81 arm32 build 2019-05-12 15:21:43 -05:00
Isuru Fernando
7ff44e0016 Remove qemu armv8 builds 2019-05-12 15:09:53 -05:00
Isuru Fernando
e3cb8ad2d6 See if ubuntu 19.04 fixes the ICE 2019-05-12 14:28:48 -05:00
Isuru Fernando
7aa6faad5f parallel build 2019-05-12 14:22:36 -05:00
Isuru Fernando
3d94ab660f build without lapack on cmake 2019-05-12 14:17:12 -05:00
Isuru Fernando
cd99dfe034 Add cmake builds and print options 2019-05-12 14:10:10 -05:00
Isuru Fernando
dadafcdcd8 Add a cmake build as well 2019-05-12 14:10:10 -05:00
Isuru Fernando
d40c109eb0 no need of gcc in clang build 2019-05-12 14:10:10 -05:00
Isuru Fernando
608cd69b66 update yes 2019-05-12 14:10:10 -05:00
Isuru Fernando
231472c4c6 Fix typo 2019-05-12 14:10:10 -05:00
Isuru Fernando
612c2d78e0 apt update 2019-05-12 14:10:10 -05:00
Isuru Fernando
dc110e179d Switch to ubuntu and parallel jobs 2019-05-12 14:10:09 -05:00
Isuru Fernando
9184590c33 gfortran->gcc-gfortran 2019-05-12 14:10:09 -05:00
Isuru Fernando
a0aaf308ed Install gfortran and add a clang job 2019-05-12 14:10:09 -05:00
Isuru Fernando
15f925fe9a Install perl 2019-05-12 14:10:09 -05:00
Isuru Fernando
21acf03e9a Install gcc 2019-05-12 14:10:09 -05:00
Isuru Fernando
ff807473bb remove sudo 2019-05-12 14:10:09 -05:00
Isuru Fernando
58829c0988 install make 2019-05-12 14:10:09 -05:00
Isuru Fernando
d86f0b9e74 Test drone CI 2019-05-12 14:10:09 -05:00
Martin Kroeker
63554d5dec Merge pull request #2129 from martin-frbg/armv8azure
Move ARMv8/gcc CI job from Travis to Azure
2019-05-12 09:55:57 +02:00
Martin Kroeker
43068288e9 Update .travis.yml 2019-05-11 22:37:06 +02:00
Martin Kroeker
999a04f101 Move ARMv8 gcc build from Travis to Azure 2019-05-11 16:08:23 +02:00
Martin Kroeker
3cb1c8d210 Move ARMv8 gcc build from Travis to Azure 2019-05-11 16:07:30 +02:00
Martin Kroeker
ff1bfe7b16 Merge pull request #2127 from martin-frbg/issue2114_2
Add NO_AFFINITY to available CMAKE options on Linux, and set it to ON
2019-05-09 15:25:09 +02:00
Martin Kroeker
9ea30f3788 Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125)
* Mark iamax_sse.S as unsuitable for MIN due to issue #2116
* Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116
2019-05-09 14:42:36 +02:00
Martin Kroeker
a3d4c65d62 Add NO_AFFINITY to available options on Linux, and set it to ON
to match the gmake default. Fixes second part of #2114
2019-05-09 11:52:02 +02:00
Martin Kroeker
e1fc02095c Merge pull request #2124 from tylerjereddy/manylinux1_azure
TST: Azure manylinux1 & clean-up
2019-05-09 08:57:37 +02:00
Martin Kroeker
0cd6d8508f Merge pull request #2123 from tylerjereddy/azure_readme_badge
DOC: Add Azure CI status badge to README
2019-05-09 08:10:19 +02:00
Martin Kroeker
c2f152c470 Merge pull request #2120 from brada4/getrf-2113
Address redundant code concern #2113
2019-05-09 08:10:00 +02:00
Tyler Reddy
4efbac28ed TST: Azure manylinux1 & clean-up
* remove some of the steps & comments
from the original Azure yml template

* modify the trigger section to use
develop since OpenBLAS primarily uses
this branch; use the same batching
behavior as downstream projects NumPy/
SciPy

* remove Travis emulated ARMv6 gcc build
because this now happens in Azure

* use documented Ubuntu vmImage name for Azure
and add in a manylinux1 test run to the matrix

[skip appveyor]
2019-05-08 21:58:49 -07:00
Martin Kroeker
406c7242f4 Add ARMV6 build to azure CI setup (#2122)
using aytekinar's Alpine image and docker script from the Travis setup

[skip ci]
2019-05-09 00:47:44 +02:00
Tyler Reddy
53703585aa DOC: Add Azure CI status badge 2019-05-08 15:15:50 -07:00
Martin Kroeker
ad20ceaa68 Update azure-pipelines.yml 2019-05-08 19:07:58 +02:00
Martin Kroeker
dd77a3f0e2 Update azure-pipelines.yml 2019-05-08 15:25:43 +02:00
Martin Kroeker
a598ab1d32 Update azure-pipelines.yml 2019-05-08 15:23:54 +02:00
Martin Kroeker
16fd8e3dbe Update azure-pipelines.yml 2019-05-08 14:14:22 +02:00
Martin Kroeker
aa4c41bad2 Update azure-pipelines.yml
take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie)
2019-05-08 14:12:02 +02:00
Martin Kroeker
5cf434167a fix tabbing in azure commands 2019-05-08 13:58:59 +02:00
Martin Kroeker
3a49e8c05a first try migrating one of the arm builds from travis 2019-05-08 13:52:22 +02:00
Martin Kroeker
95e2cf32e1 Merge pull request #2121 from tylerjereddy/ppc64le-travis
TST: add native POWER8 to CI
2019-05-08 13:31:46 +02:00
Martin Kroeker
70cea0b96b Update link to IBM MASS library, update cpu support status 2019-05-08 12:20:00 +02:00
Martin Kroeker
ae0dec77ec Merge pull request #2118 from Diazonium/develop
Change two http links to https
2019-05-08 11:41:17 +02:00
Tyler Reddy
e47b63466b TST: add native POWER8 to CI
* add native POWER8 testing to
Travis CI matrix with ppc64le
os entry
2019-05-07 19:11:08 -07:00
Zhang Xianyi
7d1b468d9d Set up CI with Azure Pipelines
[skip ci]
2019-05-08 09:58:01 +08:00
Andrew
575a84398a remove redundant code #2113 2019-05-07 23:46:54 +03:00
Martin Kroeker
5cabda79d0 Merge pull request #2117 from martin-frbg/issue2114
Fix errors in cpu affinity setup with glibc 2.6
2019-05-07 18:18:16 +02:00
Diazonium
c516209581 Change two http links to https
Closes #2109
2019-05-07 14:55:20 +02:00
Martin Kroeker
a6a8cc2b7f Fix errors in cpu enumeration with glibc 2.6
for #2114
2019-05-07 13:34:52 +02:00
Andrew
3d7debbb28 init 2019-05-07 13:15:08 +03:00
Fabrice Fontaine
5a9cce2bf6 Makefile.arm: remove -march flags
The provided -march flags, especially for ARMv5 and ARMv6 may not
necessarily match the needed ones: for ARMv5, it might be armv5,
armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain
sysroot can be used in a multilib toolchain.

Therefore, let the user building OpenBLAS pass the appropriate -march
flag.

The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they
are actually required for the build to proceed (OpenBLAS uses VFP
instructions, and assume an EABIhf ABI).

[Peter: update for v0.2.20]
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Peter Korsgaard <peter@korsgaard.com>
[Retrieved from:
https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch]
Signed-off-by: Fabrice Fontaine <fontaine.fabrice@gmail.com>
2019-05-05 18:37:28 +02:00
Martin Kroeker
6a8b4269b5 Merge pull request #2111 from martin-frbg/issue1955
Disable the SkyLakeX DGEMMIxCOPY kernels as well
2019-05-05 18:08:49 +02:00
Martin Kroeker
b1561ecc68 Disable DGEMMINCOPY as well for now
#1955
2019-05-05 15:52:01 +02:00
Martin Kroeker
7ed8431527 Disable the SkyLakeX DGEMMITCOPY kernel as well
as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955
2019-05-04 22:54:41 +02:00
Martin Kroeker
a387a23518 Merge pull request #2101 from luzpaz/misc-typos
Misc. typo fixes in comments and documentation
2019-05-04 22:28:29 +02:00
luz.paz
b46875b76b Revert Changelog.txt typos 2019-05-04 15:43:17 -04:00
luz.paz
858e609e1f Revert reference/ fixes 2019-05-04 15:01:29 -04:00
Martin Kroeker
3f427c0cf9 Merge pull request #2107 from quickwritereader/develop
sgemm/strmm kernel for power9
2019-05-02 07:56:57 +02:00
Martin Kroeker
c95317158f Merge pull request #2105 from martin-frbg/issue2104
Correct argument of CPU_ISSET for glibc <2.5
2019-05-02 07:56:37 +02:00
AbdelRauf
47f892198c conflict resolve 2019-05-01 19:36:22 +00:00
Martin Kroeker
b43c8382c8 Correct argument of CPU_ISSET for glibc <2.5
fixes #2104
2019-05-01 10:46:46 +02:00
luz.paz
daf2fec12d Misc. typo fixes
Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib`
2019-04-29 17:03:56 -04:00
Martin Kroeker
4f8143b098 Increment version to 0.3.7.dev 2019-04-29 19:25:32 +02:00
Martin Kroeker
bfeb9c16b0 Increment version to 0.3.7.dev 2019-04-29 19:24:53 +02:00
AbdelRauf
628b335e83 Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop 2019-04-29 08:57:44 +00:00
AbdelRauf
0f105dd8a5 sgemm/strmm 2019-04-29 08:49:50 +00:00
1146 changed files with 81190 additions and 15087 deletions

143
.drone.yml Normal file
View File

@@ -0,0 +1,143 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest -V
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest -V
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest -V

3
.gitignore vendored
View File

@@ -87,4 +87,5 @@ build.*
*.swp
benchmark/*.goto
benchmark/smallscaling
CMakeCache.txt
CMakeFiles/*

View File

@@ -17,7 +17,7 @@ matrix:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
script:
- set -e
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@@ -25,6 +25,15 @@ matrix:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
@@ -151,54 +160,25 @@ matrix:
os: osx
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
- brew install gcc # for gfortran
- brew install gcc@8 # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1"
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
- <<: *test-macos
osx_image: xcode8.3
osx_image: xcode10.0
env:
- BTYPE="BINARY=32"
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
- &emulated-arm
dist: trusty
sudo: required
services: docker
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
name: "Emulated Build for ARMV6 with gcc"
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
script: |
echo "FROM openblas/alpine:${IMAGE_ARCH}
COPY . /tmp/openblas
RUN mkdir /tmp/openblas/build && \
cd /tmp/openblas/build && \
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
-D TARGET=${TARGET_ARCH} \
-D BUILD_SHARED_LIBS=ON \
-D BUILD_WITHOUT_LAPACK=ON \
-D BUILD_WITHOUT_CBLAS=ON \
-D CMAKE_BUILD_TYPE=Release ../ && \
cmake --build ." > Dockerfile
docker build .
- <<: *emulated-arm
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
name: "Emulated Build for ARMV6 with clang"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
name: "Emulated Build for ARMV8 with gcc"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
name: "Emulated Build for ARMV8 with clang"
allow_failures:
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
- <<: *test-macos
osx_image: xcode10.1
env:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
# whitelist
branches:

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 6)
set(OpenBLAS_PATCH_VERSION 9.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@@ -20,9 +20,14 @@ if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
@@ -206,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
if (MSVC OR NOT NOFORTRAN)
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
endif()

View File

@@ -167,4 +167,16 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel
* Jiachen Wang <https://github.com/wjc404>
* [2019-07-29] optimize AVX2 DGEMM
* [2019-10-20] AVX512 DGEMM kernel (4x8)
* [2019-11-06] optimize AVX512 SGEMM
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
* [2020-01-07] optimize AVX2 SGEMM and STRMM

View File

@@ -1,4 +1,101 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.8
9-Feb-2020
common:
` * LAPACK has been updated to 3.9.0 (plus patches up to
January 2nd, 2020)
* CMAKE support has been improved in several areas including
cross-compilation
* a thread race condition in the GEMM3M kernels was resolved
* the "generic" (plain C) gemm beta kernel used by many targets
has been sped up
* an optimized version of the LAPACK trtrs functions has been added
* an incompatibilty between the LAPACK tests and the OpenBLAS
implementation of XERBLA was resolved, removing the numerous
warnings about wrong error exits in the former
* support for NetBSD has been added
* support for compilation with g95 and non-GNU versions of ld
has been improved
* support for compilation with (upcoming) gcc 10 has been added
POWER:
* worked around miscompilation of several POWER8 and POWER9
kernels by older versions of gcc
* added support for big-endian POWER8 and for compilation on AIX
* corrected bugs in the big-endian support for PPC440 and PPC970
* DYNAMIC_ARCH support is now available in CMAKE builds as well
ARMV8:
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
* compilation for 32bit works again
* performance of the RPCC function has been improved
* improved performance on small systems
* DYNAMIC_ARCH support is now available in CMAKE builds as well
* cross-compilation from OSX to IOS was simplified
x86_64:
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
was significantly improved
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
* added support for QEMU virtual cpus
* a compilation problem with PGI and SUN compilers was fixed
* Intel "Goldmont plus" is now autodetected
* a potential crash on program exit on MS Windows has been fixed
x86:
* an unwanted case sensitivity in the implementation of LSAME
on older 32bit AMD cpus was fixed
zarch:
* Z15 is now supported as Z14
* DYNAMIC_ARCH is now available on ZARCH as well
====================================================================
Version 0.3.7
11-Aug 2019
common:
* having the gmake special variables TARGET_ARCH or TARGET_MACH
defined no longer causes build failures in ctest or utest
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
has the same effect as setting them to 1
* a new test program was added to allow checking the library for
thread safety
* a new option USE_LOCKING was added to ensure thread safety when
OpenBLAS itself is built without multithreading but will be
called from multiple threads.
* a build failure on Linux with glibc versions earlier than 2.5
was fixed
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
on glibc 2.6 was fixed
* NO_AFFINITY was added to the CMAKE options (and defaults to being
active on Linux, as in the gmake builds)
x86_64:
* the build-time logic for detection of AVX512 availability in
the processor and compiler was fixed
* gmake builds on OSX now set the internal name of the library to
libopenblas.0.dylib (consistent with CMAKE)
* the Haswell DGEMM kernel received a significant speedup through
improved prefetch and load instructions
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
increased by avoiding vpermpd instructions
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
to fix remaining errors in DGEMM, DSYMM and DTRMM
POWER:
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
* added optimized kernels for POWER9 SGEMM and STRMM
ARMV7:
* fixed the softfp implementations of xAMAX and IxAMAX
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
they were appropriate for only a subset of platforms
====================================================================
Version 0.3.6
29-Apr-2019

View File

@@ -34,7 +34,7 @@ endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -109,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
$(MAKE) -C utest all
endif
$(MAKE) -C utest all
ifndef NO_CBLAS
$(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif
endif
@@ -243,21 +247,21 @@ prof_lapack : lapack_prebuild
lapack_prebuild :
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -315,7 +319,7 @@ lapack-test :
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
endif
lapack-runtest:

View File

@@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@@ -9,11 +9,6 @@ endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp
endif

View File

@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif

View File

@@ -51,6 +51,7 @@ endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@@ -83,7 +84,8 @@ ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@@ -99,6 +101,7 @@ else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"

View File

@@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif
endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.6
VERSION = 0.3.9.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -58,6 +58,12 @@ VERSION = 0.3.6
# For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
@@ -91,6 +97,15 @@ VERSION = 0.3.6
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# When multithreading, OpenBLAS needs to use a memory buffer for communicating
# and collating results for individual subranges of the original matrix. Since
# the original GotoBLAS of the early 2000s, the default size of this buffer has
# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC.
# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
# this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
# manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
# BUFFERSIZE = 25
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1
@@ -157,6 +172,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@@ -181,17 +200,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT
# system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory
# Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
@@ -239,6 +258,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
#
# End of user configuration
#

View File

@@ -9,6 +9,13 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture for getarch compile options.
ifndef ARCH
HOSTARCH := $(shell uname -m)
else
HOSTARCH = $(ARCH)
endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
@@ -18,6 +25,8 @@ else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -137,7 +146,12 @@ endif
endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@@ -237,6 +251,10 @@ SMP = 1
endif
endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC
NEED_PIC = 1
endif
@@ -253,9 +271,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS.
# When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
override FEXTRALIB =
endif
#
@@ -305,12 +324,14 @@ CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Majar version > 4
# GCC Major version > 4
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
@@ -388,6 +409,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
#
# Architecture dependent settings
#
@@ -523,16 +550,35 @@ endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA53
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
endif
ifeq ($(ARCH), zarch)
DYNAMIC_CORE = Z13
DYNAMIC_CORE += Z14
endif
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
DYNAMIC_CORE += POWER9
else
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE
@@ -676,7 +722,7 @@ endif
ifeq ($(C_COMPILER), PGI)
ifdef BINARY64
CCOMMON_OPT += -tp p7-64
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
else
CCOMMON_OPT += -tp p7
endif
@@ -736,6 +782,9 @@ else
FCOMMON_OPT += -m32
endif
endif
ifneq ($(NO_LAPACKE), 1)
FCOMMON_OPT += -fno-second-underscore
endif
endif
endif
@@ -744,6 +793,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
@@ -1049,7 +1100,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifdef USE_TLS
ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
@@ -1102,8 +1153,12 @@ endif
endif
ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)
override undefine NO_AFFINITY
else
CCOMMON_OPT += -DNO_AFFINITY
endif
endif
ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE

View File

@@ -28,11 +28,15 @@ endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif

View File

@@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@@ -22,8 +24,10 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
Most can also be given directly on the make or cmake command line.
### Dependencies
@@ -63,9 +67,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
## Supported CPUs and Operating Systems
Please read `GotoBLAS_01Readme.txt`.
Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
### Additional supported CPUs
@@ -109,12 +111,13 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@@ -128,26 +131,51 @@ Please read `GotoBLAS_01Readme.txt`.
#### ARM64
- **ARMv8**: Experimental
- **ARM Cortex-A57**: Experimental
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 ( different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
- **Falkor**: same as A57 (different cpu specifications)
- **ThunderX**: Optimized some Level-1 functions
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
- **TSV110**: Optimized some Level-3 helper functions
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z13**: Optimized Level-3 BLAS and Level-1,2
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default.
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
### Supported OS
- **GNU/Linux**
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **AIX**: Supported on PPC up to POWER8
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
## Usage
@@ -202,7 +230,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default.

View File

@@ -35,7 +35,15 @@ environment:
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-6.3.0-32
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
@@ -52,7 +60,14 @@ install:
before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@@ -64,3 +79,4 @@ test_script:
- echo Running Test
- cd utest
- openblas_utest

51
azure-pipelines.yml Normal file
View File

@@ -0,0 +1,51 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {

View File

@@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
@@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);

29
c_check
View File

@@ -188,14 +188,14 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$tmpf = new File::Temp( UNLINK => 1 );
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
$args = "$msa_flags -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
@@ -229,10 +229,13 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
if ($compiler eq "PGI") {
$args = " -tp skylake -c -o $tmpf.o $tmpf";
}
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
@@ -240,7 +243,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
} else {
$no_avx512 = 0;
}
unlink("tmpf.o");
unlink("$tmpf.o");
}
}
@@ -260,6 +263,19 @@ if ($architecture ne $hostarch) {
$cross = 1 if ($os ne $hostos);
# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
# the initial autodetection will have been confused by the command-line arguments to clang
# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
if (($os eq "Darwin") && ($cross_suffix ne "")) {
my $tmpnam = `xcrun --sdk iphoneos --find clang`;
$cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 );
# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
$cross =1;
$architecture = arm64;
}
$openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = "";
@@ -305,6 +321,7 @@ $linker_a = "";
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
) {
$linker_l .= $flags . " "
}

View File

@@ -45,7 +45,11 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
endif ()
if (POWER)
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
endif ()
if (X86)
@@ -73,14 +77,16 @@ if (DYNAMIC_ARCH)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
endif ()
endif ()

View File

@@ -3,7 +3,7 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets C related variables.
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
else ()
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else ()
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
if (MIPS64)
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
if (X86)
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
@@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
endif ()
endif ()
if (${CORE} STREQUAL "SKYLAKEX")
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512")
endif ()
endif ()
endif ()

View File

@@ -44,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)

View File

@@ -115,7 +115,9 @@ set(SLASRC
stplqt.f stplqt2.f stpmlqt.f
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@@ -210,7 +212,9 @@ set(CLASRC
ctplqt.f ctplqt2.f ctpmlqt.f
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cunhr_col.f )
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -299,7 +303,9 @@ set(DLASRC
dtplqt.f dtplqt2.f dtpmlqt.f
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -398,7 +404,9 @@ set(ZLASRC
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zunhr_col.f)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f

View File

@@ -715,6 +715,8 @@ set(DSRC
lapacke_dgesv_work.c
lapacke_dgesvd.c
lapacke_dgesvd_work.c
lapacke_dgesvdq.c
lapacke_dgesvdq_work.c
lapacke_dgesvdx.c
lapacke_dgesvdx_work.c
lapacke_dgesvj.c
@@ -1287,6 +1289,8 @@ set(SSRC
lapacke_sgesv_work.c
lapacke_sgesvd.c
lapacke_sgesvd_work.c
lapacke_sgesvdq.c
lapacke_sgesvdq_work.c
lapacke_sgesvdx.c
lapacke_sgesvdx_work.c
lapacke_sgesvj.c
@@ -1853,6 +1857,8 @@ set(ZSRC
lapacke_zgesv_work.c
lapacke_zgesvd.c
lapacke_zgesvd_work.c
lapacke_zgesvdq.c
lapacke_zgesvdq_work.c
lapacke_zgesvdx.c
lapacke_zgesvdx_work.c
lapacke_zgesvj.c

View File

@@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_")
endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
@@ -82,6 +85,11 @@ endif ()
# f_check
if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif ()
# Cannot run getarch on target if we are cross-compiling
@@ -97,8 +105,39 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
# Perhaps this should be inside a different file as it grows larger
file(APPEND ${TARGET_CONF_TEMP}
"#define ${TCORE}\n"
"#define CORE_${TCORE}\n"
"#define CHAR_CORENAME \"${TCORE}\"\n")
if ("${TCORE}" STREQUAL "ARMV7")
if ("${TCORE}" STREQUAL "CORE2")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t1048576\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t256\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define SLOCAL_BUFFER_SIZE\t16384\n"
"#define DLOCAL_BUFFER_SIZE\t16384\n"
"#define CLOCAL_BUFFER_SIZE\t16384\n"
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n"
@@ -113,6 +152,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
elseif ("${TCORE}" STREQUAL "ARMV8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
@@ -266,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "TSV110")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER9")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
endif()
# Or should this actually be NUM_CORES?
@@ -301,6 +421,9 @@ else(NOT CMAKE_CROSSCOMPILING)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
else()
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
if (DEFINED TARGET_CORE)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
endif ()
endif ()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")

View File

@@ -65,6 +65,18 @@ if (DEFINED TARGET)
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available
if (X86 OR X86_64)
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
endif ()
endif ()
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -136,10 +148,16 @@ endif ()
if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
@@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif ()
if (BINARY64)
@@ -181,9 +202,14 @@ if (NEED_PIC)
endif ()
if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif ()
@@ -263,6 +289,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (BUFFERSIZE)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}")
endif ()
if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif ()
@@ -283,7 +313,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")

View File

@@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
if(${OPERATING_SYSTEM} MATCHES "Android")
set(HOST_OS ANDROID)
endif(${OPERATING_SYSTEM} MATCHES "Android")
endif()
endif()
@@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
else()
set(X86 1)
if (${BINARY} EQUAL "64")
set(X86_64 1)
else ()
set(X86 1)
endif()
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
@@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
else()
set(ARM 1)
endif()
elseif (${CMAKE_CROSSCOMPILING})
if (${TARGET} STREQUAL "CORE2")
if (NOT BINARY)
set(X86 1)
elseif (${BINARY} EQUAL "64")
set(X86_64 1)
else ()
set(X86 1)
endif()
elseif (${TARGET} STREQUAL "ARMV7")
set(ARM 1)
else()
set(ARM64 1)
endif ()
else ()
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
endif()
if (X86_64)
@@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()
file(REMOVE "avx512.tmp" "avx512.o")
endif()

View File

@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@@ -131,7 +131,7 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#include <math.h>
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h>
#endif
#endif
@@ -200,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!"
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK
#endif

View File

@@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
#define BLAS_LOCK_DEFINED
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
static __inline BLASULONG rpcc(void){
BLASULONG ret = 0;
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
return ret;
}
#define RPCC_DEFINED
#define RPCC64BIT
#endif
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
@@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.text ;\
.align 4 ;\
.global REALNAME ;\
.type REALNAME, %function ;\
.macro PROLOGUE
.text ;
.p2align 2 ;
.global REALNAME ;
#ifndef __APPLE__
.type REALNAME, %function ;
#endif
REALNAME:
.endm
#define EPILOGUE

View File

@@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
#endif

View File

@@ -641,7 +641,7 @@
#define IMATCOPY_K_CT DIMATCOPY_K_CT
#define IMATCOPY_K_RT DIMATCOPY_K_RT
#define GEADD_K DGEADD_K
#define GEADD_K DGEADD_K
#else
#define AMAX_K SAMAX_K
@@ -944,7 +944,7 @@
#define IMATCOPY_K_CT SIMATCOPY_K_CT
#define IMATCOPY_K_RT SIMATCOPY_K_RT
#define GEADD_K SGEADD_K
#define GEADD_K SGEADD_K
#endif
#else
#ifdef XDOUBLE
@@ -1770,7 +1770,7 @@
#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC
#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC
#define GEADD_K ZGEADD_K
#define GEADD_K ZGEADD_K
#else
@@ -2193,7 +2193,7 @@
#define IMATCOPY_K_CTC CIMATCOPY_K_CTC
#define IMATCOPY_K_RTC CIMATCOPY_K_RTC
#define GEADD_K CGEADD_K
#define GEADD_K CGEADD_K
#endif
#endif
@@ -2806,3 +2806,160 @@ typedef struct {
#endif
#endif
#ifndef COMPLEX
#ifdef XDOUBLE
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
#elif defined(DOUBLE)
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
#else
#define TRTRS_UNU_SINGLE strtrs_UNU_single
#define TRTRS_UNN_SINGLE strtrs_UNN_single
#define TRTRS_UTU_SINGLE strtrs_UTU_single
#define TRTRS_UTN_SINGLE strtrs_UTN_single
#define TRTRS_LNU_SINGLE strtrs_LNU_single
#define TRTRS_LNN_SINGLE strtrs_LNN_single
#define TRTRS_LTU_SINGLE strtrs_LTU_single
#define TRTRS_LTN_SINGLE strtrs_LTN_single
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
#endif
#else
#ifdef XDOUBLE
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
#define TRTRS_URU_SINGLE xtrtrs_URU_single
#define TRTRS_URN_SINGLE xtrtrs_URN_single
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
#elif defined(DOUBLE)
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
#define TRTRS_URU_SINGLE ztrtrs_URU_single
#define TRTRS_URN_SINGLE ztrtrs_URN_single
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
#else
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
#define TRTRS_URU_SINGLE ctrtrs_URU_single
#define TRTRS_URN_SINGLE ctrtrs_URN_single
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
#endif
#endif

View File

@@ -39,6 +39,35 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#define str(x) #x
#ifdef OS_AIX
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
#define XVMOVDP(T,A) xvcpsgndp T, A, A
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
#else
#define XXSPLTD(T,A,z) xxspltd T, A, z
#define XXMRGHD(T,A,B) xxmrghd T, A, B
#define XXMRGLD(T,A,B) xxmrgld T, A, B
#define XXSWAPD(T,A) xxswapd T, A
#define XVMOVDP(T,A) xvmovdp T, A
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
#endif
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
@@ -241,7 +270,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -499,7 +528,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define PROLOGUE \
.section .text;\
@@ -784,7 +813,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1
@@ -829,7 +858,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON
#endif
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8)
#else

View File

@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
* Choosing a SIZE too small will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \

View File

@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
int beta_thread(int mode, BLASLONG m, BLASLONG n,
double alpha_r, double alpha_i,
void *c, BLASLONG ldc, int (*fuction)());
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
void *offsetA, BLASLONG lda,
void *offsetB, BLASLONG jb,

View File

@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2];
*edx=cpuinfo[3];
#else
__asm__ __volatile__("cpuid"
__asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
"=d" (*edx)
: "0" (op), "c"(0));
: "0" (op));
#endif
}
@@ -224,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#define HUGE_PAGESIZE ( 2 << 20)
#ifndef BUFFERSIZE
#define BUFFER_SIZE (32 << 20)
#else
#define BUFFER_SIZE (32 << BUFFERSIZE)
#endif
#define SEEK_ADDRESS
@@ -276,7 +281,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

14
cpp_thread_test/Makefile Normal file
View File

@@ -0,0 +1,14 @@
include ../Makefile.rule
all :: dgemv_tester dgemm_tester
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
clean ::
rm -f dgemv_tester dgemm_tester

View File

@@ -0,0 +1,55 @@
inline void pauser(){
/// a portable way to pause a program
std::string dummy;
std::cout << "Press enter to continue...";
std::getline(std::cin, dummy);
}
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for(uint32_t i=0; i<numMat; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
matBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
for(uint32_t j=0; j<numMat; j++){
matBlock[i+j] = matBlock[j];
}
}
}
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
for(uint32_t i=0; i<numVec; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
vecBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
for(uint32_t j=0; j<numVec; j++){
vecBlock[i+j] = vecBlock[j];
}
}
}
std::mt19937_64 InitPRNG(){
std::random_device rd;
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
return PRNG;
}
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
std::cout<<i<<std::endl;
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
}
std::cout<<std::endl;
}
std::cout<<std::endl;
}
}

View File

@@ -0,0 +1,92 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs[0]);
numConcurrentThreads = std::stoul(cliArgs[1]);
numTestRounds = std::stoul(cliArgs[2]);
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMM thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
matBlock[i].resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMM thread safety\n";
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMM round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@@ -0,0 +1,101 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs.at(0));
numConcurrentThreads = std::stoul(cliArgs.at(1));
numTestRounds = std::stoul(cliArgs.at(2));
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMV thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
matBlock.at(i).resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Allocating vectors..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
vecBlock.at(i).resize(randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
std::cout<<"done\n";
std::cout<<"Filling vectors with random numbers..."<<std::flush;
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMV round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@@ -94,7 +94,7 @@ int get_feature(char *search)
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
if (!strcmp(t, search)) { return(1); }
}
@@ -206,6 +206,33 @@ void get_subdirname(void)
printf("arm64");
}
void get_cpucount(void)
{
int n=0;
#ifdef linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("processor", buffer, 9))
n++;
}
fclose(infile);
printf("#define NUM_CORES %d\n",n);
#endif
}
void get_cpuconfig(void)
{
@@ -309,6 +336,7 @@ void get_cpuconfig(void)
printf("#define DTB_SIZE 4096 \n");
break;
}
get_cpucount();
}
@@ -344,12 +372,10 @@ void get_features(void)
if( p == NULL ) return;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
}
#endif
return;
}

View File

@@ -1197,7 +1197,11 @@ int get_cpuname(void){
case 3:
case 5:
case 6:
#if defined(__x86_64__) || defined(__amd64__)
return CPUTYPE_CORE2;
#else
return CPUTYPE_PENTIUM2;
#endif
case 7:
case 8:
case 10:
@@ -1211,7 +1215,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2;
}
break;
case 1:
case 1: // family 6 exmodel 1
switch (model) {
case 6:
return CPUTYPE_CORE2;
@@ -1228,7 +1232,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON;
}
break;
case 2:
case 2: // family 6 exmodel 2
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
@@ -1257,7 +1261,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 3:
case 3: // family 6 exmodel 3
switch (model) {
case 7:
// Bay Trail
@@ -1287,7 +1291,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 4:
case 4: // family 6 exmodel 4
switch (model) {
case 5:
case 6:
@@ -1321,7 +1325,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 5:
case 5: // family 6 exmodel 5
switch (model) {
case 6:
//Broadwell
@@ -1364,7 +1368,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 6:
case 6: // family 6 exmodel 6
switch (model) {
case 6: // Cannon Lake
if(support_avx512())
@@ -1376,7 +1380,22 @@ int get_cpuname(void){
else
return CPUTYPE_NEHALEM;
}
break;
break;
case 7: // family 6 exmodel 7
switch (model) {
case 10: // Goldmont Plus
return CPUTYPE_NEHALEM;
case 14: // Ice Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {
@@ -1412,7 +1431,11 @@ int get_cpuname(void){
case 0x5:
return CPUTYPE_AMDK6;
case 0x6:
#if defined(__x86_64__) || defined(__amd64__)
return CPUTYPE_BARCELONA;
#else
return CPUTYPE_ATHLON;
#endif
case 0xf:
switch (exfamily) {
case 0:
@@ -1795,7 +1818,11 @@ int get_coretype(void){
case 4:
case 5:
case 6:
#if defined(__x86_64__) || defined(__amd64__)
return CORE_CORE2;
#else
return CORE_P6;
#endif
case 7:
return CORE_KATMAI;
case 8:
@@ -1979,6 +2006,38 @@ int get_coretype(void){
return CORE_NEHALEM;
}
break;
case 6:
if (model == 6)
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
break;
case 7:
if (model == 10)
return CORE_NEHALEM;
if (model == 14)
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
break;
case 9:
case 8:
if (model == 14) { // Kaby Lake
@@ -2002,7 +2061,11 @@ int get_coretype(void){
if (vendor == VENDOR_AMD){
if (family <= 0x5) return CORE_80486;
#if defined(__x86_64__) || defined(__amd64__)
if (family <= 0xe) return CORE_BARCELONA;
#else
if (family <= 0xe) return CORE_ATHLON;
#endif
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;

View File

@@ -30,17 +30,20 @@
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13",
"Z14"
"Z14",
"Z15"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13",
"z14"
"z14",
"z15"
};
int detect(void)
@@ -66,6 +69,8 @@ int detect(void)
if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
return CPU_GENERIC;
}

View File

@@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
override TARGET_ARCH=
override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME)

View File

@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -1503,6 +1503,8 @@ C $ ' .' )
NC = 0
RESET = .TRUE.
ERRMAX = RZERO
RALS = RONE
RBETS = RONE
*
DO 100 IN = 1, NIDIM
N = IDIM( IN )

View File

@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@@ -1504,6 +1504,8 @@ C $ ' .' )
NC = 0
RESET = .TRUE.
ERRMAX = RZERO
RALS = RONE
RBETS = RONE
*
DO 100 IN = 1, NIDIM
N = IDIM( IN )

View File

@@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
6 NUMBER OF VALUES OF N
7 NUMBER OF VALUES OF N
1 2 3 5 7 9 35 VALUES OF N
3 NUMBER OF VALUES OF ALPHA
0.0 1.0 0.7 VALUES OF ALPHA

View File

@@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
6 NUMBER OF VALUES OF N
7 NUMBER OF VALUES OF N
0 1 2 3 5 9 35 VALUES OF N
3 NUMBER OF VALUES OF ALPHA
0.0 1.0 0.7 VALUES OF ALPHA

View File

@@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();

View File

@@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();

View File

@@ -104,7 +104,7 @@ typedef struct {
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
BETA[0], BETA[1], NULL, 0, NULL, 0, \
(FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
(FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif
#ifndef ICOPYB_OPERATION
@@ -408,13 +408,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
}
WMB;
}
current = mypos;
@@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
@@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);
@@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (is + min_i >= m_to) {
/* Thread doesn't need this buffer any more */
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
@@ -541,13 +544,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
@@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);
@@ -677,13 +681,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
@@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
} while (current != mypos);
@@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
if (is + min_i >= m_to) {
/* Thread doesn't need this buffer any more */
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
@@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
}
}
@@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg;
blas_queue_t queue[MAX_CPU_NUMBER];
@@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
newarg.m = args -> m;
newarg.n = args -> n;
newarg.k = args -> k;
@@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job);
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0;
}

View File

@@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Split local region of B into parts */
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
min_jj = MIN(n_to, js + div_n) - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
/* Copy part of local region of B into workspace */
START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,

View File

@@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
@@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,

View File

@@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < ls - js; jjs += min_jj){
min_jj = ls - js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
#else
@@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
#else
@@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else
@@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
#else
@@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
min_jj = js - ls - min_l - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
sb + min_l * (min_l + jjs) * COMPSIZE);
@@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else

View File

@@ -21,9 +21,13 @@ else
ifeq ($(ARCH),power)
COMMONOBJS += dynamic_power.$(SUFFIX)
else
ifeq ($(ARCH),zarch)
COMMONOBJS += dynamic_zarch.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@@ -85,9 +89,13 @@ else
ifeq ($(ARCH),power)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
else
ifeq ($(ARCH),zarch)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */
/* Usually it turns off and it's for debugging. */
/* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER];

View File

@@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
/* We need this global for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
if (WAIT_OBJECT_0 != wait_thread_value) {
TerminateThread(blas_threads[i],0);
}
#endif
CloseHandle(blas_threads[i]);
}

View File

@@ -329,7 +329,7 @@ int support_avx512(){
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 1){
if((ebx & (1<<7)) == 0){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){
@@ -585,9 +585,29 @@ static gotoblas_t *get_coretype(void){
}
}
return NULL;
case 7:
if (model == 10) // Goldmont Plus
return &gotoblas_NEHALEM;
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {

View File

@@ -37,17 +37,24 @@
/*********************************************************************/
#include "common.h"
#if (defined OS_LINUX || defined OS_ANDROID)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA53;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_CORTEXA72;
extern gotoblas_t gotoblas_CORTEXA73;
extern gotoblas_t gotoblas_FALKOR;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110;
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 4
#define NUM_CORETYPES 9
/*
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -63,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
static char *corename[] = {
"armv8",
"cortexa53",
"cortexa57",
"cortexa72",
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99",
"tsv110",
"unknown"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
return corename[NUM_CORETYPES];
}
@@ -94,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
switch (found)
{
case 0: return (&gotoblas_ARMV8);
case 1: return (&gotoblas_CORTEXA57);
case 2: return (&gotoblas_THUNDERX);
case 3: return (&gotoblas_THUNDERX2T99);
case 1: return (&gotoblas_CORTEXA53);
case 2: return (&gotoblas_CORTEXA57);
case 3: return (&gotoblas_CORTEXA72);
case 4: return (&gotoblas_CORTEXA73);
case 5: return (&gotoblas_FALKOR);
case 6: return (&gotoblas_THUNDERX);
case 7: return (&gotoblas_THUNDERX2T99);
case 8: return (&gotoblas_TSV110);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@@ -105,13 +127,17 @@ static gotoblas_t *force_coretype(char *coretype) {
static gotoblas_t *get_coretype(void) {
int implementer, variant, part, arch, revision, midr_el1;
#if (defined OS_LINUX || defined OS_ANDROID)
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
char coremsg[128];
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
openblas_warning(1, coremsg);
return NULL;
}
#else
return NULL;
#endif
get_cpu_ftr(MIDR_EL1, midr_el1);
/*
@@ -130,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
case 0x41: // ARM
switch (part)
{
case 0xd07: // Cortex A57
case 0xd08: // Cortex A72
case 0xd03: // Cortex A53
return &gotoblas_CORTEXA53;
case 0xd07: // Cortex A57
return &gotoblas_CORTEXA57;
case 0xd08: // Cortex A72
return &gotoblas_CORTEXA72;
case 0xd09: // Cortex A73
return &gotoblas_CORTEXA73;
}
break;
case 0x42: // Broadcom
@@ -152,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_THUNDERX2T99;
}
break;
case 0x48: // HiSilicon
switch (part)
{
case 0xd01: // tsv110
return &gotoblas_TSV110;
}
break;
case 0x51: // Qualcomm
switch (part)
{
case 0xc00: // Falkor
return &gotoblas_FALKOR;
}
break;
}
return NULL;
}

View File

@@ -3,7 +3,9 @@
extern gotoblas_t gotoblas_POWER6;
extern gotoblas_t gotoblas_POWER8;
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
extern gotoblas_t gotoblas_POWER9;
#endif
extern void openblas_warning(int verbose, const char *msg);
@@ -19,7 +21,9 @@ static char *corename[] = {
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_POWER6) return corename[1];
if (gotoblas == &gotoblas_POWER8) return corename[2];
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
if (gotoblas == &gotoblas_POWER9) return corename[3];
#endif
return corename[0];
}
@@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_POWER6;
if (__builtin_cpu_is("power8"))
return &gotoblas_POWER8;
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
if (__builtin_cpu_is("power9"))
return &gotoblas_POWER9;
#endif
return NULL;
}
@@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
{
case 1: return (&gotoblas_POWER6);
case 2: return (&gotoblas_POWER8);
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
case 3: return (&gotoblas_POWER9);
#endif
default: return NULL;
}
snprintf(message, 128, "Core not found: %s\n", coretype);

View File

@@ -0,0 +1,131 @@
#include "common.h"
extern gotoblas_t gotoblas_Z13;
extern gotoblas_t gotoblas_Z14;
//extern gotoblas_t gotoblas_Z15;
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
//extern gotoblas_t gotoblas_Z14;
//#endif
#define NUM_CORETYPES 4
extern void openblas_warning(int verbose, const char* msg);
static char* corename[] = {
"unknown",
"Z13",
"Z14",
// "Z15",
"ZARCH_GENERIC",
};
char* gotoblas_corename(void) {
if (gotoblas == &gotoblas_Z13) return corename[1];
if (gotoblas == &gotoblas_Z14) return corename[2];
// if (gotoblas == &gotoblas_Z15) return corename[3];
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
// if (gotoblas == &gotoblas_POWER9) return corename[3];
//#endif
return corename[0]; // try generic?
}
// __builtin_cpu_is is not supported by zarch
static gotoblas_t* get_coretype(void) {
FILE* infile;
char buffer[512], * p;
p = (char*)NULL;
infile = fopen("/proc/sysinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) {
if (!strncmp("Type", buffer, 4)) {
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if (strstr(p, "2964")) return &gotoblas_Z13;
if (strstr(p, "2965")) return &gotoblas_Z13;
if (strstr(p, "3906")) return &gotoblas_Z14;
if (strstr(p, "3907")) return &gotoblas_Z14;
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
return NULL; // should be ZARCH_GENERIC
}
static gotoblas_t* force_coretype(char* coretype) {
int i;
int found = -1;
char message[128];
for (i = 0; i < NUM_CORETYPES; i++)
{
if (!strncasecmp(coretype, corename[i], 20))
{
found = i;
break;
}
}
switch (found)
{
case 1: return (&gotoblas_Z13);
case 2: return (&gotoblas_Z14);
// case 3: return (&gotoblas_Z15);
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
// case 3: return (&gotoblas_POWER9);
//#endif
default: return NULL;
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
}
void gotoblas_dynamic_init(void) {
char coremsg[128];
char coren[22];
char* p;
if (gotoblas) return;
p = getenv("OPENBLAS_CORETYPE");
if (p)
{
gotoblas = force_coretype(p);
}
else
{
gotoblas = get_coretype();
}
if (gotoblas == NULL)
{
snprintf(coremsg, 128, "Falling back to Z14 core\n");
openblas_warning(1, coremsg);
gotoblas = &gotoblas_Z14;
}
if (gotoblas && gotoblas->init) {
strncpy(coren, gotoblas_corename(), 20);
sprintf(coremsg, "Core: %s\n", coren);
openblas_warning(2, coremsg);
gotoblas->init();
}
else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
}
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
/* if number of threads is larger than inital condition */
/* if number of threads is larger than initial condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums;
int ret;
@@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
} else {
@@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
common->num_procs = CPU_COUNT(&cpuset);
}
#endif

View File

@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
#if defined(OS_LINUX) || defined(OS_SUNOS)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
@@ -229,7 +229,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@@ -312,7 +312,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -822,7 +822,7 @@ static void *alloc_qalloc(void *address){
static void alloc_windows_free(struct alloc_t *alloc_info){
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
VirtualFree(alloc_info, 0, MEM_RELEASE);
}
@@ -935,7 +935,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){
#ifdef OS_WINDOWS
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
#endif
@@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif
@@ -1671,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@@ -1734,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
#if defined(OS_LINUX) || defined(OS_SUNOS)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
@@ -1772,7 +1774,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@@ -1853,7 +1855,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
@@ -1943,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@@ -1951,7 +1953,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
@@ -1975,7 +1977,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
static void alloc_mmap_free(struct release_t *release){
if (!release->address) return;
if (munmap(release -> address, BUFFER_SIZE)) {
printf("OpenBLAS : munmap failed\n");
int errsv=errno;
perror("OpenBLAS : munmap failed:");
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
}
}
@@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} else {
#ifdef DEBUG
int errsv=errno;
perror("OpenBLAS : mmap failed:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
#endif
}
#ifdef OS_LINUX
@@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){
#endif
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
@@ -2298,7 +2310,7 @@ static void *alloc_qalloc(void *address){
static void alloc_windows_free(struct release_t *release){
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
VirtualFree(release -> address, 0, MEM_RELEASE);
}
@@ -2420,7 +2432,7 @@ static void alloc_hugetlb_free(struct release_t *release){
#ifdef OS_WINDOWS
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE);
#endif
@@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
@@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){
position ++;
} while (position < NUM_BUFFERS);
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;
@@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
@@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif
@@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
@@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
@@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
@@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
@@ -2924,7 +2936,7 @@ void blas_shutdown(void){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
@@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
if (hot_alloc != 2) {
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
LOCK_COMMAND(&init_lock);
#endif
@@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size -= PAGESIZE;
}
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
UNLOCK_COMMAND(&init_lock);
#endif
@@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif

View File

@@ -38,21 +38,29 @@
#include <stdio.h>
#include "common.h"
#ifndef SMP
#define blas_cpu_number 1
#else
int blas_cpu_number = 1;
int blas_get_cpu_number(void){
return blas_cpu_number;
}
#ifdef OS_LINUX
#include <sys/sysinfo.h>
#include <sched.h>
#include <errno.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif
#ifdef OS_HAIKU
#include <unistd.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
#define FIXED_PAGESIZE 4096
void *sa = NULL;
void *sb = NULL;
static double static_buffer[BUFFER_SIZE/sizeof(double)];
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
void *blas_memory_alloc(int numproc){
if (sa == NULL){
#if 1
#if 0
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
#else
sa = (void *)malloc(BUFFER_SIZE);
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
return;
}
extern void openblas_warning(int verbose, const char * msg);
#ifndef SMP
#define blas_cpu_number 1
#define blas_num_threads 1
/* Dummy Function */
int goto_get_num_procs (void) { return 1;};
void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
#endif
/*
#if !defined(__GLIBC_PREREQ)
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
*/
return 1;
}
#endif
#endif
#ifdef OS_ANDROID
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_HAIKU
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_AIX
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {
static int nums = 0;
if (nums == 0) {
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
nums = sysinfo.dwNumberOfProcessors;
}
return nums;
}
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
static int nums = 0;
int m[2];
size_t len;
if (nums == 0) {
m[0] = CTL_HW;
m[1] = HW_NCPU;
len = sizeof(int);
sysctl(m, 2, &nums, &len, NULL, 0);
}
return nums;
}
#endif
#if defined(OS_DARWIN)
int get_num_procs(void) {
static int nums = 0;
size_t len;
if (nums == 0){
len = sizeof(int);
sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
}
return nums;
}
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
rlim_t StackSize;
StackSize=limitMB*1024*1024;
result=getrlimit(RLIMIT_STACK, &rl);
if(result==0){
if(rl.rlim_cur < StackSize){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
}
*/
#endif
/*
OpenBLAS uses the numbers of CPU cores in multithreading.
It can be set by openblas_set_num_threads(int num_threads);
*/
int blas_cpu_number = 0;
/*
The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/
int blas_num_threads = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;
}
void openblas_fork_handler()
{
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
// built with "make USE_OPENMP=0".
// Hanging can still happen when OpenBLAS is built against the libgomp
// implementation of OpenMP. The problem is tracked at:
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
#endif
}
extern int openblas_num_threads_env();
extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
int blas_omp_num = 0;
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
// blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
blas_goto_num=openblas_goto_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
}
#endif
// blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
#ifdef DEBUG
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif
blas_cpu_number = blas_num_threads;
return blas_num_threads;
}
#endif
int openblas_get_num_procs(void) {
#ifndef SMP
return 1;
#else
return get_num_procs();
#endif
}
int openblas_get_num_threads(void) {
#ifndef SMP
return 1;
#else
// init blas_cpu_number if needed
blas_get_cpu_number();
return blas_cpu_number;
#endif
}

View File

@@ -78,10 +78,10 @@ char tmpstr[20];
#ifdef DYNAMIC_ARCH
strcat(tmp_config_str, gotoblas_corename());
#endif
if (openblas_get_parallel() == 0)
sprintf(tmpstr, " SINGLE_THREADED");
else
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
if (openblas_get_parallel() == 0)
sprintf(tmpstr, " SINGLE_THREADED");
else
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
strcat(tmp_config_str, tmpstr);
return tmp_config_str;
}

View File

@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
@@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c

View File

@@ -50,7 +50,10 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
gotoblas_init();
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();
// If the process is about to exit, don't bother releasing any resources
// The kernel is much better at bulk releasing then.
if (!reserved)
gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;

View File

@@ -618,19 +618,6 @@
# functions added for lapack-3.7.0
slarfy,
slasyf_rk,
ssyconvf_rook,
ssytf2_rk,
ssytrf_rk,
ssytrs_3,
ssytri_3,
ssytri_3x,
ssycon_3,
ssysv_rk,
slasyf_aa,
ssysv_aa,
ssytrf_aa,
ssytrs_aa,
strevc3,
sgelqt,
sgelqt3,
@@ -647,33 +634,8 @@
stplqt,
stplqt2,
stpmlqt,
ssytrd_2stage,
ssytrd_sy2sb,
ssytrd_sb2st,
ssb2st_kernels,
ssyevd_2stage,
ssyev_2stage,
ssyevx_2stage,
ssyevr_2stage,
ssbev_2stage,
ssbevx_2stage,
ssbevd_2stage,
ssygv_2stage,
dlarfy,
dlasyf_rk,
dsyconvf,
dsyconvf_rook,
dsytf2_rk,
dsytrf_rk,
dsytrs_3,
dsytri_3,
dsytri_3x,
dsycon_3,
dsysv_rk,
dlasyf_aa,
dsysv_aa,
dsytrf_aa,
dsytrs_aa,
dtrevc3,
dgelqt,
dgelqt3,
@@ -690,45 +652,8 @@
dtplqt,
dtplqt2,
dtpmlqt,
dsytrd_2stage,
dsytrd_sy2sb,
dsytrd_sb2st,
dsb2st_kernels,
dsyevd_2stage,
dsyev_2stage,
dsyevx_2stage,
dsyevr_2stage,
dsbev_2stage,
dsbevx_2stage,
dsbevd_2stage,
dsygv_2stage,
chetf2_rk,
chetrf_rk,
chetri_3,
chetri_3x,
chetrs_3,
checon_3,
chesv_rk,
chesv_aa,
chetrf_aa,
chetrs_aa,
clahef_aa,
clahef_rk,
clarfy,
clasyf_rk,
clasyf_aa,
csyconvf,
csyconvf_rook,
csytf2_rk,
csytrf_rk,
csytrf_aa,
csytrs_3,
csytrs_aa,
csytri_3,
csytri_3x,
csycon_3,
csysv_rk,
csysv_aa,
ctrevc3,
cgelqt,
cgelqt3,
@@ -745,45 +670,8 @@
ctplqt,
ctplqt2,
ctpmlqt,
chetrd_2stage,
chetrd_he2hb,
chetrd_hb2st,
chb2st_kernels,
cheevd_2stage,
cheev_2stage,
cheevx_2stage,
cheevr_2stage,
chbev_2stage,
chbevx_2stage,
chbevd_2stage,
chegv_2stage,
zhetf2_rk,
zhetrf_rk,
zhetri_3,
zhetri_3x,
zhetrs_3,
zhecon_3,
zhesv_rk,
zhesv_aa,
zhetrf_aa,
zhetrs_aa,
zlahef_aa,
zlahef_rk,
zlarfy,
zlasyf_rk,
zlasyf_aa,
zsyconvf,
zsyconvf_rook,
zsytrs_aa,
zsytf2_rk,
zsytrf_rk,
zsytrf_aa,
zsytrs_3,
zsytri_3,
zsytri_3x,
zsycon_3,
zsysv_rk,
zsysv_aa,
ztrevc3,
ztplqt,
ztplqt2,
@@ -800,18 +688,6 @@
zlaswlq,
zlamswlq,
zgemlq,
zhetrd_2stage,
zhetrd_he2hb,
zhetrd_hb2st,
zhb2st_kernels,
zheevd_2stage,
zheev_2stage,
zheevx_2stage,
zheevr_2stage,
zhbev_2stage,
zhbevx_2stage,
zhbevd_2stage,
zhegv_2stage,
sladiv1,
dladiv1,
iparam2stage,
@@ -819,24 +695,18 @@
# functions added for lapack-3.8.0
ilaenv2stage,
ssysv_aa_2stage,
ssytrf_aa_2stage,
ssytrs_aa_2stage,
chesv_aa_2stage,
chetrf_aa_2stage,
chetrs_aa_2stage,
csysv_aa_2stage,
csytrf_aa_2stage,
csytrs_aa_2stage,
dsysv_aa_2stage,
dsytrf_aa_2stage,
dsytrs_aa_2stage,
zhesv_aa_2stage,
zhetrf_aa_2stage,
zhetrs_aa_2stage,
zsysv_aa_2stage,
zsytrf_aa_2stage,
zsytrs_aa_2stage
# functions added for lapack-3.9.0
cgesvdq,
cungtsqr,
dcombssq,
dgesvdq,
dorgtsqr,
scombssq,
sgesvdq,
sorgtsqr,
zgesvdq,
zungtsqr
);
@lapack_extendedprecision_objs = (
@@ -3489,6 +3359,15 @@
LAPACKE_zsytrf_aa_2stage_work,
LAPACKE_zsytrs_aa_2stage,
LAPACKE_zsytrs_aa_2stage_work,
# new functions from 3.9.0
LAPACKE_dgesvdq,
LAPACKE_dgesvdq_work,
LAPACKE_sgesvdq,
LAPACKE_sgesvdq_work,
LAPACKE_zgesvdq,
LAPACKE_zgesvdq_work
);
#These function may need 2 underscores.
@@ -3509,6 +3388,65 @@
zlahef_rook, zlasyf_rook,
zsytf2_rook, zsytrf_rook, zsytrs_rook,
zsytri_rook, zsycon_rook, zsysv_rook,
# 3.7.0
slasyf_rk, ssyconvf_rook, ssytf2_rk,
ssytrf_rk, ssytrs_3, ssytri_3,
ssytri_3x, ssycon_3, ssysv_rk,
slasyf_aa, ssysv_aa, ssytrf_aa,
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
dsytf2_rk, dsytrf_rk, dsytrs_3,
dsytri_3, dsytri_3x, dsycon_3,
dsysv_rk, dlasyf_aa, dsysv_aa,
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
dsbevd_2stage, dsygv_2stage, chetf2_rk,
chetrf_rk, chetri_3, chetri_3x,
chetrs_3, checon_3, chesv_rk,
chesv_aa, chetrf_aa, chetrs_aa,
clahef_aa, clahef_rk, clasyf_rk,
clasyf_aa, csytf2_rk, csytrf_rk,
csytrf_aa, csytrs_3, csytrs_aa,
csytri_3, csytri_3x, csycon_3,
csysv_rk, csysv_aa, csyconvf_rook,
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
chb2st_kernels, cheevd_2stage, cheev_2stage,
cheevx_2stage, cheevr_2stage, chbev_2stage,
chbevx_2stage, chbevd_2stage, chegv_2stage,
zhetf2_rk, zhetrf_rk, zhetri_3,
zhetri_3x, zhetrs_3, zhecon_3,
zhesv_rk, zhesv_aa, zhetrf_aa,
zhetrs_aa, zlahef_aa, zlahef_rk,
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
zsytrs_aa, zsytf2_rk, zsytrf_rk,
zsytrf_aa, zsytrs_3, zsytri_3,
zsytri_3x, zsycon_3, zsysv_rk,
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
zheev_2stage, zheevx_2stage, zheevr_2stage,
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
zhegv_2stage,
# 3.8.0
ssysv_aa_2stage, ssytrf_aa_2stage,
ssytrs_aa_2stage, chesv_aa_2stage,
chetrf_aa_2stage, chetrs_aa_2stage,
csysv_aa_2stage, csytrf_aa_2stage,
csytrs_aa_2stage, dsysv_aa_2stage,
dsytrf_aa_2stage, dsytrs_aa_2stage,
zhesv_aa_2stage, zhetrf_aa_2stage,
zhetrs_aa_2stage, zsysv_aa_2stage,
zsytrf_aa_2stage, zsytrs_aa_2stage,
# 3.9.0
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
);

14
f_check
View File

@@ -19,7 +19,7 @@ $nofortran = 0;
$compiler = join(" ", @ARGV);
$compiler_bin = shift(@ARGV);
# f77 is too ambiguous
$compiler = "" if $compiler eq "f77";
@@ -71,7 +71,7 @@ if ($compiler eq "") {
if ($data =~ /GNU/) {
$data =~ /(\d)\.(\d).(\d)/;
$data =~ /(\d+)\.(\d+).(\d+)/;
$major = $1;
$minor = $2;
@@ -125,11 +125,16 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;
}
if ($vendor =~ /G95/) {
if ($ENV{NO_LAPACKE} != 1) {
$need2bu = "";
}
}
}
if ($vendor eq "") {
@@ -277,6 +282,8 @@ $linker_a = "";
if ($link ne "") {
$link =~ s/\-Y\sP\,/\-Y/g;
$link =~ s/\-R\s*/\-rpath\@/g;
$link =~ s/\-rpath\s+/\-rpath\@/g;
@@ -327,6 +334,7 @@ if ($link ne "") {
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /^\-l$/)
) {
$linker_l .= $flags . " ";

View File

@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef OS_WINDOWS
#include <windows.h>
#endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
@@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
#ifdef OS_WINDOWS
SYSTEM_INFO sysinfo;
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
int m[2], count;
size_t len;
#endif
@@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors;
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
m[0] = CTL_HW;
m[1] = HW_NCPU;
len = sizeof(int);

View File

@@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c

View File

@@ -394,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
SLAPACKOBJS = \
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
#DLAPACKOBJS = \
@@ -405,14 +405,14 @@ SLAPACKOBJS = \
DLAPACKOBJS = \
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
QLAPACKOBJS = \
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
qtrtrs.$(SUFFIX)
#CLAPACKOBJS = \
# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
@@ -423,7 +423,7 @@ QLAPACKOBJS = \
CLAPACKOBJS = \
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
#ZLAPACKOBJS = \
@@ -435,13 +435,14 @@ CLAPACKOBJS = \
ZLAPACKOBJS = \
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
XLAPACKOBJS = \
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
xtrtrs.$(SUFFIX)
ifneq ($(NO_LAPACK), 1)
SBLASOBJS += $(SLAPACKOBJS)
@@ -2031,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
@@ -2040,7 +2041,25 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
$(CC) -c $(CFLAGS) $< -o $(@F)
sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c

View File

@@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@@ -44,19 +44,19 @@
#ifndef COMPLEX
#ifdef XDOUBLE
#define ERROR_NAME "QGESV "
#define ERROR_NAME "QGESV"
#elif defined(DOUBLE)
#define ERROR_NAME "DGESV "
#define ERROR_NAME "DGESV"
#else
#define ERROR_NAME "SGESV "
#define ERROR_NAME "SGESV"
#endif
#else
#ifdef XDOUBLE
#define ERROR_NAME "XGESV "
#define ERROR_NAME "XGESV"
#elif defined(DOUBLE)
#define ERROR_NAME "ZGESV "
#define ERROR_NAME "ZGESV"
#else
#define ERROR_NAME "CGESV "
#define ERROR_NAME "CGESV"
#endif
#endif
@@ -89,7 +89,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
if (args.m < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
if (args.n < 0) info = 2;
if (args.m < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
if (args.n < 0) info = 2;
if (args.m < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
if (trans < 0) info = 1;
if (info != 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
return 0;
}

View File

@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -99,7 +99,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

171
interface/lapack/trtrs.c Normal file
View File

@@ -0,0 +1,171 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifdef XDOUBLE
#define ERROR_NAME "QTRTRS"
#elif defined(DOUBLE)
#define ERROR_NAME "DTRTRS"
#else
#define ERROR_NAME "STRTRS"
#endif
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE, TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE, TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE, TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE,
};
#ifdef SMP
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL, TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL, TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL, TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL,
};
#endif
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
FLOAT *b, blasint *ldB, blasint *Info){
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
blas_arg_t args;
blasint info;
int uplo, trans, diag;
FLOAT *buffer;
#ifdef PPC440
extern
#endif
FLOAT *sa, *sb;
PRINT_DEBUG_NAME;
args.m = *N;
args.n = *NRHS;
args.a = (void *)a;
args.lda = *ldA;
args.b = (void *)b;
args.ldb = *ldB;
info = 0;
TOUPPER(trans_arg);
trans = -1;
if (trans_arg == 'N') trans = 0;
if (trans_arg == 'T') trans = 1;
if (trans_arg == 'R') trans = 0;
if (trans_arg == 'C') trans = 1;
uplo = -1;
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
diag = -1;
if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1;
if (args.ldb < MAX(1, args.m)) info = 9;
if (args.lda < MAX(1, args.m)) info = 7;
if (args.n < 0) info = 5;
if (args.m < 0) info = 4;
if (trans < 0) info = 2;
if (uplo < 0) info = 1;
if (diag < 0) info = 3;
if (info != 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}
args.alpha = NULL;
args.beta = NULL;
*Info = 0;
if (args.m == 0) return 0;
if (diag) {
if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
*Info = IAMIN_K(args.m, args.a, args.lda + 1);
return 0;
}
}
IDEBUG_START;
FUNCTION_PROFILE_START();
#ifndef PPC440
buffer = (FLOAT *)blas_memory_alloc(1);
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
#ifdef SMP
args.common = NULL;
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {
#endif
(trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
#ifdef SMP
} else {
(trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
}
#endif
#ifndef PPC440
blas_memory_free(buffer);
#endif
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);
IDEBUG_END;
return 0;
}

View File

@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
if (args.n < 0) info = 2;
if (args.m < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
if (args.n < 0) info = 2;
if (args.m < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
if (trans < 0) info = 1;
if (info != 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
return 0;
}

View File

@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (args.n < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

View File

@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
*Info = - info;
return 0;
}

Some files were not shown because too many files have changed in this diff Show More