Compare commits
528 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e32f3b1447 | ||
|
|
5e94aa4877 | ||
|
|
93f3e27574 | ||
|
|
785c389b0e | ||
|
|
c222b25b81 | ||
|
|
221da8bf05 | ||
|
|
eb285b4d20 | ||
|
|
cafdd999b8 | ||
|
|
92ca92a46c | ||
|
|
486c35c5dc | ||
|
|
0e05ea9bac | ||
|
|
5ba3699f41 | ||
|
|
8eefa530cd | ||
|
|
de40d47edf | ||
|
|
7c162b8a21 | ||
|
|
0544cbc806 | ||
|
|
120d20731f | ||
|
|
dc345d84df | ||
|
|
616921fd91 | ||
|
|
8a9e9a82a1 | ||
|
|
7ea5e07d1c | ||
|
|
cb6ef49857 | ||
|
|
63994e1cdb | ||
|
|
496e3019bc | ||
|
|
169be3f097 | ||
|
|
6ccbb089c2 | ||
|
|
59ebe3636a | ||
|
|
5a6bba3061 | ||
|
|
dff173e50e | ||
|
|
7e5cbb6f35 | ||
|
|
303bdb673b | ||
|
|
754433f420 | ||
|
|
7f0d523b42 | ||
|
|
c353d8b106 | ||
|
|
579be3aa9d | ||
|
|
449e8ea443 | ||
|
|
3bec250cf9 | ||
|
|
f03dd23e90 | ||
|
|
fa93d63365 | ||
|
|
90e6c66a57 | ||
|
|
32d97330b3 | ||
|
|
29eaf4b6d7 | ||
|
|
47c1bf7f4d | ||
|
|
2b55f0ad30 | ||
|
|
a5b32ab06c | ||
|
|
50545b19d0 | ||
|
|
b3cbd60d7a | ||
|
|
70199d1905 | ||
|
|
cfe63d8cc2 | ||
|
|
d55b10830f | ||
|
|
c1c10cbb21 | ||
|
|
5989841524 | ||
|
|
68a43db358 | ||
|
|
9694037b23 | ||
|
|
71faa1c1a7 | ||
|
|
3447d04eaf | ||
|
|
8b5cdcc64c | ||
|
|
4e00d96a78 | ||
|
|
ce9ea8f826 | ||
|
|
0b909203cb | ||
|
|
096da2f51a | ||
|
|
2f96a2c55b | ||
|
|
833bd0f8ff | ||
|
|
77b8f49556 | ||
|
|
1c3e20ce48 | ||
|
|
83b6be7976 | ||
|
|
081b188529 | ||
|
|
f3f969f681 | ||
|
|
8019e70211 | ||
|
|
8d2a796f49 | ||
|
|
8dc9fd4dfe | ||
|
|
abc67bdd74 | ||
|
|
1f62a82789 | ||
|
|
e9fb8f62b1 | ||
|
|
fbf4f48f4a | ||
|
|
b9ad450295 | ||
|
|
e011ad820a | ||
|
|
ff42e68652 | ||
|
|
23f322f997 | ||
|
|
093d37de8d | ||
|
|
d65e9a2bbd | ||
|
|
78100b8093 | ||
|
|
70f45749b9 | ||
|
|
e5dcdeb550 | ||
|
|
952cc2ba38 | ||
|
|
feaafbedd3 | ||
|
|
1c67567008 | ||
|
|
4e979bf75b | ||
|
|
daa4310db5 | ||
|
|
b8f3605132 | ||
|
|
b36018be6d | ||
|
|
3a100b2797 | ||
|
|
38742d5547 | ||
|
|
bd4c032f52 | ||
|
|
9dc9b7b95e | ||
|
|
9f5cdc49d4 | ||
|
|
b7b408a120 | ||
|
|
92b10212de | ||
|
|
b73bf01378 | ||
|
|
eb3c9f1db9 | ||
|
|
fd2ff2714f | ||
|
|
2ea2bd99c7 | ||
|
|
fbb894948c | ||
|
|
e711659c90 | ||
|
|
893e6e57c4 | ||
|
|
456ee2e1f0 | ||
|
|
9998f8ed8b | ||
|
|
80db5f11e1 | ||
|
|
52de4cc8fd | ||
|
|
44028581cc | ||
|
|
86ab939936 | ||
|
|
375b1875c8 | ||
|
|
6c85cb1869 | ||
|
|
995768bbc5 | ||
|
|
96ad579428 | ||
|
|
8d84403205 | ||
|
|
8729db117c | ||
|
|
0833a4846a | ||
|
|
50f7fc1401 | ||
|
|
d1b53806be | ||
|
|
a0f0a802fc | ||
|
|
700fe5b5ee | ||
|
|
bb2729c855 | ||
|
|
aae44d040d | ||
|
|
6362c34ee6 | ||
|
|
f60840c420 | ||
|
|
109e18cd96 | ||
|
|
ae1579be13 | ||
|
|
3ccf8885ac | ||
|
|
454847588e | ||
|
|
0257f26488 | ||
|
|
c45b7aef14 | ||
|
|
312060d0d6 | ||
|
|
cd765f094b | ||
|
|
64639f440f | ||
|
|
3a66c8cac1 | ||
|
|
4c35b8dbaa | ||
|
|
ed9af2f7da | ||
|
|
5fd1edead9 | ||
|
|
26478eb0d0 | ||
|
|
eeecd623d8 | ||
|
|
3ce6bcdb5f | ||
|
|
6fbe51072b | ||
|
|
611445c7f8 | ||
|
|
2cd9306bb5 | ||
|
|
c418c81224 | ||
|
|
025741f16a | ||
|
|
0ae49d2990 | ||
|
|
105e26e12a | ||
|
|
f41d52665d | ||
|
|
d573d24de7 | ||
|
|
31d6c2eb7d | ||
|
|
b7cc69ee62 | ||
|
|
aeef942c4f | ||
|
|
445ca2f418 | ||
|
|
13226e3101 | ||
|
|
1a6ea8ee6d | ||
|
|
c6ecb195e6 | ||
|
|
b28db31429 | ||
|
|
6baa9b07d7 | ||
|
|
a4896b5538 | ||
|
|
3938e59569 | ||
|
|
9d5079008f | ||
|
|
3518617f5b | ||
|
|
715f4650d9 | ||
|
|
10705183ce | ||
|
|
235599f17a | ||
|
|
b863b32ac5 | ||
|
|
dd04143d4a | ||
|
|
f3a6164bff | ||
|
|
dedd822d1a | ||
|
|
2181fb7047 | ||
|
|
a9b62c03f8 | ||
|
|
97762234f9 | ||
|
|
948d11fc51 | ||
|
|
c815b8fb85 | ||
|
|
e20709e976 | ||
|
|
934e601e93 | ||
|
|
a4c3668f99 | ||
|
|
867232c6a4 | ||
|
|
5aaf70ef95 | ||
|
|
ae2a0995cc | ||
|
|
83dae28ae2 | ||
|
|
da986d2e83 | ||
|
|
6bc487de35 | ||
|
|
cf2a8e410c | ||
|
|
eb1e9c8c92 | ||
|
|
f95989cbc1 | ||
|
|
f3065a0eed | ||
|
|
04226f1e97 | ||
|
|
0925ef70db | ||
|
|
371e6f73d4 | ||
|
|
d117dfd505 | ||
|
|
883c39773a | ||
|
|
b09b5be0a4 | ||
|
|
bfb5fbdb4d | ||
|
|
3da6d66da9 | ||
|
|
08fa83aba2 | ||
|
|
63d3ee8dfc | ||
|
|
1191db1a49 | ||
|
|
1f6071590d | ||
|
|
0caf1434c9 | ||
|
|
73128f3883 | ||
|
|
cad0d150db | ||
|
|
eba0aeb7cd | ||
|
|
0c07c356c1 | ||
|
|
82b75f97e5 | ||
|
|
7887c45077 | ||
|
|
3e67017ac8 | ||
|
|
b3ac6ee222 | ||
|
|
6082e556cd | ||
|
|
92315173d5 | ||
|
|
351d12b94e | ||
|
|
bf73aa141b | ||
|
|
71e96163db | ||
|
|
819e852ae7 | ||
|
|
4e466d739c | ||
|
|
4c6a457358 | ||
|
|
836c414e22 | ||
|
|
d403eb3c2f | ||
|
|
3cd97f1a80 | ||
|
|
9955f0996f | ||
|
|
430c11e135 | ||
|
|
fbacd2605d | ||
|
|
6fa89b06a1 | ||
|
|
68597002ea | ||
|
|
d2a6285549 | ||
|
|
d999688d1a | ||
|
|
928fe1b28e | ||
|
|
ccc28c6d60 | ||
|
|
ae43b75a6a | ||
|
|
54fc06fd70 | ||
|
|
1df9a2013d | ||
|
|
274ff5cdb8 | ||
|
|
eb2eddf241 | ||
|
|
8691825944 | ||
|
|
7dc8a76f60 | ||
|
|
df857551c0 | ||
|
|
85ccdce8c4 | ||
|
|
aeabe0a83f | ||
|
|
1b90989662 | ||
|
|
e3e8b5cdca | ||
|
|
69b16a894d | ||
|
|
6782e5767d | ||
|
|
48f5a89f92 | ||
|
|
4ae1610f37 | ||
|
|
911c3e2f4b | ||
|
|
fab49e49e5 | ||
|
|
b687fba5bc | ||
|
|
46a8c2519a | ||
|
|
e9437eebd2 | ||
|
|
3a39062cfc | ||
|
|
eaa0be1313 | ||
|
|
6ff013bae0 | ||
|
|
0d669e04bb | ||
|
|
17cdd9f9e1 | ||
|
|
6bcb06fcb1 | ||
|
|
b7315f8401 | ||
|
|
9b19e9e1b0 | ||
|
|
6bd67ddbab | ||
|
|
5da9484d93 | ||
|
|
844629af57 | ||
|
|
2beaa82c05 | ||
|
|
e8a2aed2b9 | ||
|
|
f262031685 | ||
|
|
5f6206fa2d | ||
|
|
f2cde2ccfb | ||
|
|
ba7838d2e1 | ||
|
|
a448884a63 | ||
|
|
17609f88f1 | ||
|
|
3a2df19db6 | ||
|
|
d2093a40d3 | ||
|
|
aa04b0925e | ||
|
|
258ac56e0a | ||
|
|
56837e9d92 | ||
|
|
bb5413863f | ||
|
|
32f5907fef | ||
|
|
ac10236cc8 | ||
|
|
8617d75548 | ||
|
|
c07d78b9e9 | ||
|
|
6355c25dde | ||
|
|
5e244d80f2 | ||
|
|
ede5efebab | ||
|
|
84908d60d2 | ||
|
|
596a22325a | ||
|
|
7f58f3ad0e | ||
|
|
c0d570a357 | ||
|
|
6b83079368 | ||
|
|
673e5a0495 | ||
|
|
bfa2cc7d64 | ||
|
|
e7c4d6705a | ||
|
|
2a1911cc14 | ||
|
|
9f7a9a32e3 | ||
|
|
2463938879 | ||
|
|
5d6525c87c | ||
|
|
6cb47ea3f0 | ||
|
|
459bb9291d | ||
|
|
3f1077ce6f | ||
|
|
eb45eb6942 | ||
|
|
f2becb777a | ||
|
|
5997b6b491 | ||
|
|
4b21b646ea | ||
|
|
7ec7b999a5 | ||
|
|
af9ac0898a | ||
|
|
c7b5a459b6 | ||
|
|
9b2f0323d6 | ||
|
|
9f6984fe4b | ||
|
|
42203dafdc | ||
|
|
a4f17a9297 | ||
|
|
733d97b2df | ||
|
|
ea747cf933 | ||
|
|
4de545aa7d | ||
|
|
6e9a93ec19 | ||
|
|
fde8a8e6a0 | ||
|
|
256fc15f5f | ||
|
|
ee498525e0 | ||
|
|
1fec0570f6 | ||
|
|
b5af7b9c78 | ||
|
|
f3c314550c | ||
|
|
847c20c9b7 | ||
|
|
4c22828812 | ||
|
|
e79712d969 | ||
|
|
be09551cdf | ||
|
|
ec1ef6aa9e | ||
|
|
11c59acfb1 | ||
|
|
bf0d92a310 | ||
|
|
db066151ee | ||
|
|
3a55dca2dc | ||
|
|
7d380f7d79 | ||
|
|
300f158d3b | ||
|
|
3635fdbf2b | ||
|
|
b6552b11eb | ||
|
|
3dc6b26eff | ||
|
|
5fdf9ad24f | ||
|
|
2fe967c542 | ||
|
|
6d8595351c | ||
|
|
f40200f559 | ||
|
|
a95a5e52b8 | ||
|
|
e3d846ab57 | ||
|
|
8506386d82 | ||
|
|
9ef96b32a6 | ||
|
|
b48c025974 | ||
|
|
a1fce67743 | ||
|
|
103b32fdb7 | ||
|
|
aef9804089 | ||
|
|
303869f572 | ||
|
|
02d9203981 | ||
|
|
7b6808b69c | ||
|
|
5f36f18148 | ||
|
|
d47fe78b0e | ||
|
|
ebe2f47a0f | ||
|
|
20d417762f | ||
|
|
321288597c | ||
|
|
be147a9f28 | ||
|
|
c275290ea6 | ||
|
|
b7bbb02447 | ||
|
|
bf1430f7d7 | ||
|
|
dccff2e785 | ||
|
|
5c3458a6e7 | ||
|
|
1776ad82c0 | ||
|
|
4e2f81cfa1 | ||
|
|
acf6002ab2 | ||
|
|
96a794e9fd | ||
|
|
3d36c45116 | ||
|
|
648491e1aa | ||
|
|
2dfb804cb9 | ||
|
|
4c153ec9da | ||
|
|
7eecd8e39c | ||
|
|
f0406a7708 | ||
|
|
561f3fd995 | ||
|
|
30efed14d1 | ||
|
|
af2e7f28fc | ||
|
|
4250e6ed64 | ||
|
|
7b0b7c11d2 | ||
|
|
d14cf1ccf4 | ||
|
|
3f6ab1582a | ||
|
|
28e96458e5 | ||
|
|
95fb98f556 | ||
|
|
4801c6d36b | ||
|
|
9440fa607d | ||
|
|
94db259e5b | ||
|
|
f49f8047ac | ||
|
|
825777faab | ||
|
|
9c89757562 | ||
|
|
b0b7600bef | ||
|
|
9b04baeaee | ||
|
|
8a074b3965 | ||
|
|
211ab03b14 | ||
|
|
1733f927e6 | ||
|
|
182b06d6ad | ||
|
|
7a9050d681 | ||
|
|
0ba29fd262 | ||
|
|
bafa021ed6 | ||
|
|
b89d9762a2 | ||
|
|
08dedf4c5e | ||
|
|
b89c781637 | ||
|
|
dd7ff77f4b | ||
|
|
8fb76134bc | ||
|
|
04d671aae2 | ||
|
|
f69a0be712 | ||
|
|
ae9e8b131e | ||
|
|
9086543f50 | ||
|
|
abea977ded | ||
|
|
6b6c9b1441 | ||
|
|
a97b301aaa | ||
|
|
2f13f04224 | ||
|
|
7c7505a778 | ||
|
|
5a4f1a2118 | ||
|
|
3b761892df | ||
|
|
eebfeba768 | ||
|
|
7684c4f8f8 | ||
|
|
7faf42b7bb | ||
|
|
a575f1e4c7 | ||
|
|
cdbfb891da | ||
|
|
280552b988 | ||
|
|
bbd4bb0154 | ||
|
|
6d3efb2b58 | ||
|
|
d9ff2cd90d | ||
|
|
2a43062de7 | ||
|
|
4ea794a522 | ||
|
|
ece0bfb881 | ||
|
|
1f4b6a5d5d | ||
|
|
be8f70d269 | ||
|
|
e674e1c735 | ||
|
|
6ca898b63b | ||
|
|
26411acd56 | ||
|
|
0ab4076dd8 | ||
|
|
a0caa762b3 | ||
|
|
900d5a3205 | ||
|
|
a17cf36225 | ||
|
|
148c4cc5fd | ||
|
|
d0c3543c3f | ||
|
|
909ad04aef | ||
|
|
417efd41c6 | ||
|
|
9cdc828afa | ||
|
|
7a9a4dbc4f | ||
|
|
a469b32cf4 | ||
|
|
27649b9543 | ||
|
|
16f3df5d35 | ||
|
|
1aded69821 | ||
|
|
c00289ba54 | ||
|
|
8fe794f059 | ||
|
|
74c10b57c6 | ||
|
|
c5495d2056 | ||
|
|
c70496b108 | ||
|
|
ca8d8835f5 | ||
|
|
d76b20b4d2 | ||
|
|
85af04da3c | ||
|
|
11e0dcbffb | ||
|
|
79366ff7a9 | ||
|
|
21d05a4835 | ||
|
|
940f38f6dd | ||
|
|
1778fd4219 | ||
|
|
969dd6175e | ||
|
|
d8d5682481 | ||
|
|
f66c11fc22 | ||
|
|
5ecffc28f2 | ||
|
|
86dda5c2fa | ||
|
|
1e52572be3 | ||
|
|
d2cb610272 | ||
|
|
a211bc9b6a | ||
|
|
9208ab8603 | ||
|
|
b43deb4ad6 | ||
|
|
b911525c81 | ||
|
|
7ff44e0016 | ||
|
|
e3cb8ad2d6 | ||
|
|
7aa6faad5f | ||
|
|
3d94ab660f | ||
|
|
cd99dfe034 | ||
|
|
dadafcdcd8 | ||
|
|
d40c109eb0 | ||
|
|
608cd69b66 | ||
|
|
231472c4c6 | ||
|
|
612c2d78e0 | ||
|
|
dc110e179d | ||
|
|
9184590c33 | ||
|
|
a0aaf308ed | ||
|
|
15f925fe9a | ||
|
|
21acf03e9a | ||
|
|
ff807473bb | ||
|
|
58829c0988 | ||
|
|
d86f0b9e74 | ||
|
|
63554d5dec | ||
|
|
43068288e9 | ||
|
|
999a04f101 | ||
|
|
3cb1c8d210 | ||
|
|
ff1bfe7b16 | ||
|
|
9ea30f3788 | ||
|
|
a3d4c65d62 | ||
|
|
e1fc02095c | ||
|
|
0cd6d8508f | ||
|
|
c2f152c470 | ||
|
|
4efbac28ed | ||
|
|
406c7242f4 | ||
|
|
53703585aa | ||
|
|
ad20ceaa68 | ||
|
|
dd77a3f0e2 | ||
|
|
a598ab1d32 | ||
|
|
16fd8e3dbe | ||
|
|
aa4c41bad2 | ||
|
|
5cf434167a | ||
|
|
3a49e8c05a | ||
|
|
95e2cf32e1 | ||
|
|
70cea0b96b | ||
|
|
ae0dec77ec | ||
|
|
e47b63466b | ||
|
|
7d1b468d9d | ||
|
|
575a84398a | ||
|
|
5cabda79d0 | ||
|
|
c516209581 | ||
|
|
a6a8cc2b7f | ||
|
|
3d7debbb28 | ||
|
|
5a9cce2bf6 | ||
|
|
6a8b4269b5 | ||
|
|
b1561ecc68 | ||
|
|
7ed8431527 | ||
|
|
a387a23518 | ||
|
|
b46875b76b | ||
|
|
858e609e1f | ||
|
|
3f427c0cf9 | ||
|
|
c95317158f | ||
|
|
47f892198c | ||
|
|
b43c8382c8 | ||
|
|
daf2fec12d | ||
|
|
4f8143b098 | ||
|
|
bfeb9c16b0 | ||
|
|
628b335e83 | ||
|
|
0f105dd8a5 |
143
.drone.yml
Normal file
143
.drone.yml
Normal file
@@ -0,0 +1,143 @@
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc_make
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:19.04
|
||||
environment:
|
||||
CC: gcc
|
||||
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran perl
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm32_gcc_make
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:19.04
|
||||
environment:
|
||||
CC: gcc
|
||||
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran perl
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_clang_make
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: clang
|
||||
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran perl
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm32_clang_cmake
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: clang
|
||||
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||
commands:
|
||||
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC g++ perl cmake
|
||||
- $CC --version
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest -V
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc_cmake
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: gcc
|
||||
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||
commands:
|
||||
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC g++ perl cmake
|
||||
- $CC --version
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest -V
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_clang_cmake
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: clang
|
||||
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||
commands:
|
||||
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC g++ perl cmake
|
||||
- $CC --version
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest -V
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -87,4 +87,5 @@ build.*
|
||||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
|
||||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
|
||||
62
.travis.yml
62
.travis.yml
@@ -17,7 +17,7 @@ matrix:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
@@ -25,6 +25,15 @@ matrix:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX
|
||||
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
@@ -151,54 +160,25 @@ matrix:
|
||||
os: osx
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
osx_image: xcode10.0
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||
|
||||
- &emulated-arm
|
||||
dist: trusty
|
||||
sudo: required
|
||||
services: docker
|
||||
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
|
||||
name: "Emulated Build for ARMV6 with gcc"
|
||||
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
|
||||
script: |
|
||||
echo "FROM openblas/alpine:${IMAGE_ARCH}
|
||||
COPY . /tmp/openblas
|
||||
RUN mkdir /tmp/openblas/build && \
|
||||
cd /tmp/openblas/build && \
|
||||
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
|
||||
-D TARGET=${TARGET_ARCH} \
|
||||
-D BUILD_SHARED_LIBS=ON \
|
||||
-D BUILD_WITHOUT_LAPACK=ON \
|
||||
-D BUILD_WITHOUT_CBLAS=ON \
|
||||
-D CMAKE_BUILD_TYPE=Release ../ && \
|
||||
cmake --build ." > Dockerfile
|
||||
docker build .
|
||||
- <<: *emulated-arm
|
||||
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
|
||||
name: "Emulated Build for ARMV6 with clang"
|
||||
- <<: *emulated-arm
|
||||
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
|
||||
name: "Emulated Build for ARMV8 with gcc"
|
||||
- <<: *emulated-arm
|
||||
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
|
||||
name: "Emulated Build for ARMV8 with clang"
|
||||
|
||||
allow_failures:
|
||||
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
|
||||
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
|
||||
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
|
||||
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
|
||||
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 6)
|
||||
set(OpenBLAS_PATCH_VERSION 9.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
@@ -20,9 +20,14 @@ if(MSVC)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
endif()
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
set(NO_AFFINITY 1)
|
||||
endif()
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoids conflicts with other BLAS libraries, especially when using
|
||||
@@ -206,7 +211,8 @@ if (USE_THREAD)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
|
||||
if (MSVC OR NOT NOFORTRAN)
|
||||
#if (MSVC OR NOT NOFORTRAN)
|
||||
if (NOT NO_CBLAS)
|
||||
# Broken without fortran on unix
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
@@ -167,4 +167,16 @@ In chronological order:
|
||||
* [2017-02-26] ztrmm kernel for IBM z13
|
||||
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
||||
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
|
||||
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
|
||||
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||
* [2019-04-29] power9 sgemm/strmm kernel
|
||||
|
||||
* Jiachen Wang <https://github.com/wjc404>
|
||||
* [2019-07-29] optimize AVX2 DGEMM
|
||||
* [2019-10-20] AVX512 DGEMM kernel (4x8)
|
||||
* [2019-11-06] optimize AVX512 SGEMM
|
||||
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
|
||||
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
||||
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
||||
|
||||
@@ -1,4 +1,101 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.8
|
||||
9-Feb-2020
|
||||
|
||||
common:
|
||||
* LAPACK has been updated to 3.9.0 (plus patches up to
|
||||
January 2nd, 2020)
|
||||
* CMAKE support has been improved in several areas including
|
||||
cross-compilation
|
||||
* a thread race condition in the GEMM3M kernels was resolved
|
||||
* the "generic" (plain C) gemm beta kernel used by many targets
|
||||
has been sped up
|
||||
* an optimized version of the LAPACK trtrs functions has been added
|
||||
* an incompatibility between the LAPACK tests and the OpenBLAS
|
||||
implementation of XERBLA was resolved, removing the numerous
|
||||
warnings about wrong error exits in the former
|
||||
* support for NetBSD has been added
|
||||
* support for compilation with g95 and non-GNU versions of ld
|
||||
has been improved
|
||||
* support for compilation with (upcoming) gcc 10 has been added
|
||||
|
||||
POWER:
|
||||
* worked around miscompilation of several POWER8 and POWER9
|
||||
kernels by older versions of gcc
|
||||
* added support for big-endian POWER8 and for compilation on AIX
|
||||
* corrected bugs in the big-endian support for PPC440 and PPC970
|
||||
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||
|
||||
ARMV8:
|
||||
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
|
||||
* compilation for 32bit works again
|
||||
* performance of the RPCC function has been improved
|
||||
* improved performance on small systems
|
||||
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||
* cross-compilation from OSX to IOS was simplified
|
||||
|
||||
x86_64:
|
||||
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
|
||||
was significantly improved
|
||||
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
|
||||
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
|
||||
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
|
||||
* added support for QEMU virtual cpus
|
||||
* a compilation problem with PGI and SUN compilers was fixed
|
||||
* Intel "Goldmont plus" is now autodetected
|
||||
* a potential crash on program exit on MS Windows has been fixed
|
||||
|
||||
x86:
|
||||
* an unwanted case sensitivity in the implementation of LSAME
|
||||
on older 32bit AMD cpus was fixed
|
||||
|
||||
zarch:
|
||||
* Z15 is now supported as Z14
|
||||
* DYNAMIC_ARCH is now available on ZARCH as well
|
||||
|
||||
====================================================================
|
||||
Version 0.3.7
|
||||
11-Aug-2019
|
||||
|
||||
common:
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
|
||||
x86_64:
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
|
||||
POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
|
||||
ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
|
||||
====================================================================
|
||||
Version 0.3.6
|
||||
29-Apr-2019
|
||||
|
||||
28
Makefile
28
Makefile
@@ -34,7 +34,7 @@ endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
@@ -109,6 +109,7 @@ endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
@@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
$(MAKE) -C utest all
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||
$(MAKE) -C cpp_thread_test all
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -243,21 +247,21 @@ prof_lapack : lapack_prebuild
|
||||
|
||||
lapack_prebuild :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@@ -315,7 +319,7 @@ lapack-test :
|
||||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||
endif
|
||||
|
||||
lapack-runtest:
|
||||
|
||||
13
Makefile.arm
13
Makefile.arm
@@ -1,7 +1,7 @@
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
@@ -9,11 +9,6 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV6)
|
||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -march=armv5
|
||||
FCOMMON_OPT += -march=armv5
|
||||
CCOMMON_OPT += -mfpu=vfp
|
||||
FCOMMON_OPT += -mfpu=vfp
|
||||
endif
|
||||
|
||||
@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ endif
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@@ -83,7 +84,8 @@ ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
@@ -99,6 +101,7 @@ else
|
||||
#install on AIX has different options syntax
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
|
||||
@@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
|
||||
endif
|
||||
endif
|
||||
|
||||
# workaround for C->FORTRAN ABI violation in LAPACKE
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
endif
|
||||
|
||||
FLAMEPATH = $(HOME)/flame/lib
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.6
|
||||
VERSION = 0.3.9.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -58,6 +58,12 @@ VERSION = 0.3.6
|
||||
# For force setting for multi threaded, specify USE_THREAD = 1
|
||||
# USE_THREAD = 0
|
||||
|
||||
# If you want to build a single-threaded OpenBLAS, but expect to call this
|
||||
# from several concurrent threads in some other program, comment this in for
|
||||
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
|
||||
# be necessary when USE_OPENMP=1)
|
||||
# USE_LOCKING = 1
|
||||
|
||||
# If you're going to use this library with OpenMP, please comment it in.
|
||||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
||||
# USE_OPENMP = 1
|
||||
@@ -91,6 +97,15 @@ VERSION = 0.3.6
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# When multithreading, OpenBLAS needs to use a memory buffer for communicating
|
||||
# and collating results for individual subranges of the original matrix. Since
|
||||
# the original GotoBLAS of the early 2000s, the default size of this buffer has
|
||||
# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC.
|
||||
# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
|
||||
# this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
|
||||
# manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
|
||||
# BUFFERSIZE = 25
|
||||
|
||||
# If you don't need to install the static library, please comment this in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
@@ -157,6 +172,10 @@ NO_AFFINITY = 1
|
||||
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
||||
# NO_AVX2 = 1
|
||||
|
||||
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
|
||||
# system will try to determine this automatically)
|
||||
# NO_AVX512 = 1
|
||||
|
||||
# Don't use parallel make.
|
||||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
@@ -181,17 +200,17 @@ NO_AFFINITY = 1
|
||||
# time out to improve performance. This number should be from 4 to 30
|
||||
# which corresponds to (1 << n) cycles. For example, if you set to 26,
|
||||
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
|
||||
# system). Also you can control this mumber by THREAD_TIMEOUT
|
||||
# system). Also you can control this number by THREAD_TIMEOUT
|
||||
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
|
||||
|
||||
# Using special device driver for mapping physically contigous memory
|
||||
# Using special device driver for mapping physically contiguous memory
|
||||
# to the user space. If bigphysarea is enabled, it will use it.
|
||||
# DEVICEDRIVER_ALLOCATION = 1
|
||||
|
||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||
# number of floating point operations necessary for the given problem size, no longer
|
||||
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||
@@ -239,6 +258,21 @@ COMMON_PROF = -pg
|
||||
# SYMBOLPREFIX=
|
||||
# SYMBOLSUFFIX=
|
||||
|
||||
# Run a C++ based thread safety tester after the build is done.
|
||||
# This is mostly intended as a developer feature to spot regressions, but users and
|
||||
# package maintainers can enable this if they have doubts about the thread safety of
|
||||
# the library, given the configuration in this file.
|
||||
# By default, the thread safety tester launches 52 concurrent calculations at the same
|
||||
# time.
|
||||
#
|
||||
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
|
||||
#
|
||||
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
|
||||
# an OpenMP implementation. If you are cross-compiling this test will probably not
|
||||
# work at all.
|
||||
#
|
||||
# CPP_THREAD_SAFETY_TEST = 1
|
||||
|
||||
#
|
||||
# End of user configuration
|
||||
#
|
||||
|
||||
@@ -9,6 +9,13 @@ ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
ifndef ARCH
|
||||
HOSTARCH := $(shell uname -m)
|
||||
else
|
||||
HOSTARCH = $(ARCH)
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
@@ -18,6 +25,8 @@ else ifeq ($(ARCH), i386)
|
||||
override ARCH=x86
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
else ifeq ($(ARCH), zarch)
|
||||
override ARCH=zarch
|
||||
endif
|
||||
|
||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
@@ -137,7 +146,12 @@ endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
@@ -237,6 +251,10 @@ SMP = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(SMP), 1)
|
||||
USE_LOCKING =
|
||||
endif
|
||||
|
||||
ifndef NEED_PIC
|
||||
NEED_PIC = 1
|
||||
endif
|
||||
@@ -253,9 +271,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||
OBJCONV = $(CROSS_SUFFIX)objconv
|
||||
|
||||
|
||||
# For detect fortran failed, only build BLAS.
|
||||
# When fortran support was either not detected or actively deselected, only build BLAS.
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
NO_LAPACK = 1
|
||||
override FEXTRALIB =
|
||||
endif
|
||||
|
||||
#
|
||||
@@ -305,12 +324,14 @@ CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Test for supporting MS_ABI
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Majar version > 4
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
@@ -388,6 +409,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
|
||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||
endif
|
||||
|
||||
ifdef USE_LOCKING
|
||||
ifneq ($(USE_LOCKING), 0)
|
||||
CCOMMON_OPT += -DUSE_LOCKING
|
||||
endif
|
||||
endif
|
||||
|
||||
#
|
||||
# Architecture dependent settings
|
||||
#
|
||||
@@ -523,16 +550,35 @@ endif
|
||||
|
||||
ifeq ($(ARCH), arm64)
|
||||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA53
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
DYNAMIC_CORE = Z13
|
||||
DYNAMIC_CORE += Z14
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
DYNAMIC_CORE += POWER9
|
||||
else
|
||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
@@ -676,7 +722,7 @@ endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
ifdef BINARY64
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
else
|
||||
CCOMMON_OPT += -tp p7
|
||||
endif
|
||||
@@ -736,6 +782,9 @@ else
|
||||
FCOMMON_OPT += -m32
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
FCOMMON_OPT += -fno-second-underscore
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -744,6 +793,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
# work around ABI problem with passing single-character arguments
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
EXTRALIB += -lgfortran
|
||||
@@ -1049,7 +1100,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
ifeq ($(USE_TLS), 1)
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
@@ -1102,8 +1153,12 @@ endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
ifeq ($(NO_AFFINITY), 0)
|
||||
override undefine NO_AFFINITY
|
||||
else
|
||||
CCOMMON_OPT += -DNO_AFFINITY
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef FUNCTION_PROFILE
|
||||
CCOMMON_OPT += -DFUNCTION_PROFILE
|
||||
|
||||
@@ -28,11 +28,15 @@ endif
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX2
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -mavx2
|
||||
endif
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
56
README.md
56
README.md
@@ -6,11 +6,13 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
## Binary Packages
|
||||
|
||||
@@ -22,8 +24,10 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
||||
|
||||
## Installation from Source
|
||||
|
||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||
Most can also be given directly on the make or cmake command line.
|
||||
|
||||
### Dependencies
|
||||
|
||||
@@ -63,9 +67,7 @@ A debug version can be built using `make DEBUG=1`.
|
||||
|
||||
### Compile with MASS support on Power CPU (optional)
|
||||
|
||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
||||
are tuned for optimum performance on POWER architectures.
|
||||
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
|
||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as shown:
|
||||
|
||||
@@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
|
||||
|
||||
## Supported CPUs and Operating Systems
|
||||
|
||||
Please read `GotoBLAS_01Readme.txt`.
|
||||
Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
|
||||
|
||||
### Additional supported CPUs
|
||||
|
||||
@@ -109,12 +111,13 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
||||
|
||||
#### MIPS64
|
||||
|
||||
@@ -128,26 +131,51 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
#### ARM64
|
||||
|
||||
- **ARMv8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
|
||||
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
- **Falkor**: same as A57 (different cpu specifications)
|
||||
- **ThunderX**: Optimized some Level-1 functions
|
||||
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||
- **TSV110**: Optimized some Level-3 helper functions
|
||||
|
||||
#### PPC/PPC64
|
||||
|
||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
||||
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
|
||||
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
||||
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
|
||||
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows you to specify an individual list of targets to include instead of the default.
|
||||
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
|
||||
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
|
||||
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
|
||||
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
|
||||
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
||||
|
||||
### Supported OS
|
||||
|
||||
- **GNU/Linux**
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||
- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
|
||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -202,7 +230,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
* OpenBLAS does not set processor affinity by default.
|
||||
|
||||
18
appveyor.yml
18
appveyor.yml
@@ -35,7 +35,15 @@ environment:
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
- COMPILER: cl
|
||||
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-5.3.0
|
||||
WITH_FORTRAN: ignore
|
||||
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
@@ -52,7 +60,14 @@ install:
|
||||
before_build:
|
||||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||
- cd build
|
||||
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
@@ -64,3 +79,4 @@ test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
|
||||
|
||||
51
azure-pipelines.yml
Normal file
51
azure-pipelines.yml
Normal file
@@ -0,0 +1,51 @@
|
||||
trigger:
|
||||
# start a new build for every push
|
||||
batch: False
|
||||
branches:
|
||||
include:
|
||||
- develop
|
||||
|
||||
jobs:
|
||||
# manylinux1 is useful to test because the
|
||||
# standard Docker container uses an old version
|
||||
# of gcc / glibc
|
||||
- job: manylinux1_gcc
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
steps:
|
||||
- script: |
|
||||
echo "FROM quay.io/pypa/manylinux1_x86_64
|
||||
COPY . /tmp/openblas
|
||||
RUN cd /tmp/openblas && \
|
||||
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
|
||||
BTYPE='BINARY=64' CC=gcc && \
|
||||
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
|
||||
make -C test $COMMON_FLAGS $BTYPE && \
|
||||
make -C ctest $COMMON_FLAGS $BTYPE && \
|
||||
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
|
||||
docker build .
|
||||
displayName: Run manylinux1 docker build
|
||||
- job: Intel_SDE_skx
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
steps:
|
||||
- script: |
|
||||
# at the time of writing the available Azure Ubuntu vm image
|
||||
# does not support AVX512VL, so use more recent LTS version
|
||||
echo "FROM ubuntu:bionic
|
||||
COPY . /tmp/openblas
|
||||
RUN apt-get -y update && apt-get -y install \\
|
||||
cmake \\
|
||||
gfortran \\
|
||||
make \\
|
||||
wget
|
||||
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
|
||||
mkdir sde-external-8.35.0-2019-03-11-lin && \\
|
||||
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
|
||||
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
|
||||
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
|
||||
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
|
||||
docker build -t intel_sde .
|
||||
# we need a privileged docker run for sde process attachment
|
||||
docker run --privileged intel_sde
|
||||
displayName: 'Run AVX512 SkylakeX docker build / test'
|
||||
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
|
||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time\n");
|
||||
|
||||
for (i = from; i <= to; i += step) {
|
||||
|
||||
@@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
@@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
29
c_check
29
c_check
@@ -188,14 +188,14 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
$args = "$msa_flags -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
@@ -229,10 +229,13 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
|
||||
if ($compiler eq "PGI") {
|
||||
$args = " -tp skylake -c -o $tmpf.o $tmpf";
|
||||
}
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
@@ -240,7 +243,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
} else {
|
||||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("tmpf.o");
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,6 +263,19 @@ if ($architecture ne $hostarch) {
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
|
||||
# the initial autodetection will have been confused by the command-line arguments to clang
|
||||
# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
|
||||
if (($os eq "Darwin") && ($cross_suffix ne "")) {
|
||||
my $tmpnam = `xcrun --sdk iphoneos --find clang`;
|
||||
$cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 );
|
||||
# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
|
||||
$cross =1;
|
||||
$architecture = arm64;
|
||||
}
|
||||
|
||||
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
@@ -305,6 +321,7 @@ $linker_a = "";
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
) {
|
||||
$linker_l .= $flags . " "
|
||||
}
|
||||
|
||||
@@ -45,7 +45,11 @@ endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
||||
endif ()
|
||||
|
||||
if (X86)
|
||||
@@ -73,14 +77,16 @@ if (DYNAMIC_ARCH)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
endif ()
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
unset(DYNAMIC_ARCH)
|
||||
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
|
||||
unset(DYNAMIC_ARCH CACHE)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets C related variables.
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
||||
else ()
|
||||
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
else ()
|
||||
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
|
||||
if (MIPS64)
|
||||
|
||||
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
||||
if (X86)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
||||
@@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL "SKYLAKEX")
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
@@ -44,7 +44,10 @@ endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
# Append libgfortran to the extra link libraries, keeping what EXTRALIB already holds.
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# helper functions for the kernel CMakeLists.txt
|
||||
|
||||
|
||||
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
|
||||
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
|
||||
macro(SetDefaultL1)
|
||||
set(SAMAXKERNEL amax.S)
|
||||
set(DAMAXKERNEL amax.S)
|
||||
|
||||
@@ -115,7 +115,9 @@ set(SLASRC
|
||||
stplqt.f stplqt2.f stpmlqt.f
|
||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
@@ -210,7 +212,9 @@ set(CLASRC
|
||||
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cunhr_col.f )
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
@@ -299,7 +303,9 @@ set(DLASRC
|
||||
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
@@ -398,7 +404,9 @@ set(ZLASRC
|
||||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zunhr_col.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
||||
@@ -715,6 +715,8 @@ set(DSRC
|
||||
lapacke_dgesv_work.c
|
||||
lapacke_dgesvd.c
|
||||
lapacke_dgesvd_work.c
|
||||
lapacke_dgesvdq.c
|
||||
lapacke_dgesvdq_work.c
|
||||
lapacke_dgesvdx.c
|
||||
lapacke_dgesvdx_work.c
|
||||
lapacke_dgesvj.c
|
||||
@@ -1287,6 +1289,8 @@ set(SSRC
|
||||
lapacke_sgesv_work.c
|
||||
lapacke_sgesvd.c
|
||||
lapacke_sgesvd_work.c
|
||||
lapacke_sgesvdq.c
|
||||
lapacke_sgesvdq_work.c
|
||||
lapacke_sgesvdx.c
|
||||
lapacke_sgesvdx_work.c
|
||||
lapacke_sgesvj.c
|
||||
@@ -1853,6 +1857,8 @@ set(ZSRC
|
||||
lapacke_zgesv_work.c
|
||||
lapacke_zgesvd.c
|
||||
lapacke_zgesvd_work.c
|
||||
lapacke_zgesvdq.c
|
||||
lapacke_zgesvdq_work.c
|
||||
lapacke_zgesvdx.c
|
||||
lapacke_zgesvdx_work.c
|
||||
lapacke_zgesvj.c
|
||||
|
||||
@@ -59,6 +59,9 @@ set(FU "")
|
||||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
||||
set(FU "_")
|
||||
endif()
|
||||
if(MINGW AND NOT MINGW64)
|
||||
set(FU "_")
|
||||
endif()
|
||||
|
||||
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
||||
if (${COMPILER_ID} STREQUAL "GNU")
|
||||
@@ -82,6 +85,11 @@ endif ()
|
||||
# f_check
|
||||
if (NOT NOFORTRAN)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
||||
else ()
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define BUNDERSCORE _\n"
|
||||
"#define NEEDBUNDERSCORE 1\n")
|
||||
set(BU "_")
|
||||
endif ()
|
||||
|
||||
# Cannot run getarch on target if we are cross-compiling
|
||||
@@ -97,8 +105,39 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
# Perhaps this should be inside a different file as it grows larger
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ${TCORE}\n"
|
||||
"#define CORE_${TCORE}\n"
|
||||
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||
if ("${TCORE}" STREQUAL "ARMV7")
|
||||
if ("${TCORE}" STREQUAL "CORE2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t256\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
@@ -113,6 +152,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
@@ -266,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER9")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
endif()
|
||||
|
||||
# Or should this actually be NUM_CORES?
|
||||
@@ -301,6 +421,9 @@ else(NOT CMAKE_CROSSCOMPILING)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
else()
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
if (DEFINED TARGET_CORE)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
|
||||
@@ -65,6 +65,18 @@ if (DEFINED TARGET)
|
||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||
endif ()
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||
endif ()
|
||||
|
||||
# On 32-bit x86 no AVX support is available
|
||||
if (X86 OR X86_64)
|
||||
# 32-bit build: either requested explicitly through BINARY or implied by 4-byte pointers.
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (INTERFACE64)
|
||||
message(STATUS "Using 64-bit integers.")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
||||
@@ -136,10 +148,16 @@ endif ()
|
||||
|
||||
if (USE_THREAD)
|
||||
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
|
||||
else()
|
||||
if (${USE_LOCKING})
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
endif ()
|
||||
if (NOT DEFINED NEED_PIC)
|
||||
set(NEED_PIC 1)
|
||||
endif ()
|
||||
@@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||
if (NOT NOFORTRAN)
|
||||
# Fortran Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
else ()
|
||||
set(NO_LAPACK 1)
|
||||
set(NO_LAPACKE 1)
|
||||
endif ()
|
||||
|
||||
if (BINARY64)
|
||||
@@ -181,9 +202,14 @@ if (NEED_PIC)
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
endif ()
|
||||
else ()
|
||||
unset (DYNAMIC_ARCH)
|
||||
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
@@ -263,6 +289,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||
|
||||
if (BUFFERSIZE)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}")
|
||||
endif ()
|
||||
|
||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
@@ -283,7 +313,7 @@ endif ()
|
||||
|
||||
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
||||
|
||||
# TODO: nead to convert these Makefiles
|
||||
# TODO: need to convert these Makefiles
|
||||
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
||||
|
||||
if (${CORE} STREQUAL "PPC440")
|
||||
|
||||
@@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
|
||||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||
if(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
set(HOST_OS ANDROID)
|
||||
endif(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
else()
|
||||
set(X86 1)
|
||||
if (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
@@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||
else()
|
||||
set(ARM 1)
|
||||
endif()
|
||||
elseif (${CMAKE_CROSSCOMPILING})
|
||||
if (${TARGET} STREQUAL "CORE2")
|
||||
if (NOT BINARY)
|
||||
set(X86 1)
|
||||
elseif (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
set(ARM64 1)
|
||||
endif ()
|
||||
else ()
|
||||
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
|
||||
endif()
|
||||
|
||||
if (X86_64)
|
||||
@@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
file(REMOVE "avx512.tmp" "avx512.o")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
|
||||
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
|
||||
endfunction ()
|
||||
|
||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
|
||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
|
||||
# @param sources_in the source files to build from
|
||||
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
|
||||
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
|
||||
|
||||
4
common.h
4
common.h
@@ -131,7 +131,7 @@ extern "C" {
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#endif
|
||||
@@ -200,7 +200,7 @@ extern "C" {
|
||||
#error "You can't specify both LOCK operation!"
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#define USE_PTHREAD_LOCK
|
||||
#undef USE_PTHREAD_SPINLOCK
|
||||
#endif
|
||||
|
||||
@@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
static __inline BLASULONG rpcc(void){
|
||||
BLASULONG ret = 0;
|
||||
|
||||
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define RPCC_DEFINED
|
||||
#define RPCC64BIT
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
@@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
.macro PROLOGUE
|
||||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
.endm
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
|
||||
146
common_lapack.h
146
common_lapack.h
@@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
|
||||
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
#endif
|
||||
|
||||
165
common_macro.h
165
common_macro.h
@@ -641,7 +641,7 @@
|
||||
#define IMATCOPY_K_CT DIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K DGEADD_K
|
||||
#define GEADD_K DGEADD_K
|
||||
#else
|
||||
|
||||
#define AMAX_K SAMAX_K
|
||||
@@ -944,7 +944,7 @@
|
||||
#define IMATCOPY_K_CT SIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
#define GEADD_K SGEADD_K
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
@@ -1770,7 +1770,7 @@
|
||||
#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K ZGEADD_K
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#else
|
||||
|
||||
@@ -2193,7 +2193,7 @@
|
||||
#define IMATCOPY_K_CTC CIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC CIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K CGEADD_K
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -2806,3 +2806,160 @@ typedef struct {
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
|
||||
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE strtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE strtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE strtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE strtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE strtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE strtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE strtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE strtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE xtrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE xtrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ztrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ztrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ctrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ctrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -39,6 +39,35 @@
|
||||
#ifndef COMMON_POWER
|
||||
#define COMMON_POWER
|
||||
|
||||
#define str(x) #x
|
||||
|
||||
#ifdef OS_AIX
|
||||
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
|
||||
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
|
||||
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
|
||||
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
|
||||
#define XVMOVDP(T,A) xvcpsgndp T, A, A
|
||||
|
||||
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
|
||||
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
|
||||
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
|
||||
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
|
||||
|
||||
#else
|
||||
#define XXSPLTD(T,A,z) xxspltd T, A, z
|
||||
#define XXMRGHD(T,A,B) xxmrghd T, A, B
|
||||
#define XXMRGLD(T,A,B) xxmrgld T, A, B
|
||||
#define XXSWAPD(T,A) xxswapd T, A
|
||||
#define XVMOVDP(T,A) xvmovdp T, A
|
||||
|
||||
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
|
||||
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
@@ -241,7 +270,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
@@ -499,7 +528,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#ifndef __64BIT__
|
||||
#define PROLOGUE \
|
||||
.section .text;\
|
||||
@@ -784,7 +813,7 @@ Lmcount$lazy_ptr:
|
||||
|
||||
#define HALT mfspr r0, 1023
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#if defined(PPC440) || defined(PPC440FP2)
|
||||
#undef MAX_CPU_NUMBER
|
||||
#define MAX_CPU_NUMBER 1
|
||||
@@ -829,7 +858,7 @@ Lmcount$lazy_ptr:
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#ifndef __64BIT__
|
||||
#define FRAMESLOT(X) (((X) * 4) + 8)
|
||||
#else
|
||||
|
||||
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* SIZE must be carefully chosen to be:
|
||||
* - as small as possible to maximize the number of stack allocation
|
||||
* - large enough to support all architectures and kernel
|
||||
* Chosing a too small SIZE will lead to a stack smashing.
|
||||
* Choosing a SIZE too small will lead to a stack smashing.
|
||||
*/
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||
|
||||
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *c, BLASLONG ldc, int (*fuction)());
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
|
||||
@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#endif
|
||||
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
//Enable some optimazation for barcelona.
|
||||
//Enable some optimization for barcelona.
|
||||
#define BARCELONA_OPTIMIZATION
|
||||
#endif
|
||||
|
||||
|
||||
@@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||
*ecx=cpuinfo[2];
|
||||
*edx=cpuinfo[3];
|
||||
#else
|
||||
__asm__ __volatile__("cpuid"
|
||||
__asm__ __volatile__("mov $0, %%ecx;"
|
||||
"cpuid"
|
||||
: "=a" (*eax),
|
||||
"=b" (*ebx),
|
||||
"=c" (*ecx),
|
||||
"=d" (*edx)
|
||||
: "0" (op), "c"(0));
|
||||
: "0" (op));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -224,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
#ifndef BUFFERSIZE
|
||||
#define BUFFER_SIZE (32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
@@ -276,7 +281,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#ifdef ASSEMBLER
|
||||
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
//Enable some optimazation for barcelona.
|
||||
//Enable some optimization for barcelona.
|
||||
#define BARCELONA_OPTIMIZATION
|
||||
#endif
|
||||
|
||||
|
||||
14
cpp_thread_test/Makefile
Normal file
14
cpp_thread_test/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
# Build-and-run rules for the C++ thread-safety testers (DGEMV and DGEMM).
# Each tester is compiled against the static ../libopenblas.a with OpenMP
# enabled and is executed immediately after it has been linked.
include ../Makefile.rule
|
||||
|
||||
# Default goal: build and run both testers.
all :: dgemv_tester dgemm_tester
|
||||
|
||||
# Compile dgemv_thread_safety.cpp, then run the resulting binary.
dgemv_tester :
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
|
||||
./dgemv_tester
|
||||
|
||||
# dgemv_tester is listed as a prerequisite, so the DGEMV test is built and run
# before the DGEMM test.
dgemm_tester : dgemv_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
|
||||
./dgemm_tester
|
||||
|
||||
# Remove the tester binaries.
clean ::
|
||||
rm -f dgemv_tester dgemm_tester
|
||||
55
cpp_thread_test/cpp_thread_safety_common.h
Normal file
55
cpp_thread_test/cpp_thread_safety_common.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/// Portable "pause": prints a prompt and blocks until one line has been read
/// from standard input. The text that was read is discarded.
inline void pauser(){
    std::cout << "Press enter to continue...";
    std::string discardedLine;
    std::getline(std::cin, discardedLine);
}
|
||||
|
||||
/// Fills the first numMat entries of matBlock with values drawn from rngdist,
/// then copies that group of numMat matrices into every following group, so
/// that each of the numConcurrentThreads groups holds identical data.
/// Every matrix written must already hold randomMatSize*randomMatSize elements.
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    // Reference group: random contents.
    for(uint32_t m = 0; m < numMat; ++m){
        std::vector<double>& target = matBlock[m];
        const uint32_t elemCount = static_cast<uint32_t>(randomMatSize*randomMatSize);
        for(uint32_t e = 0; e < elemCount; ++e){
            target[e] = rngdist(PRNG);
        }
    }

    // Remaining groups: element-wise copies of the reference group.
    const uint32_t totalMat = numConcurrentThreads*numMat;
    for(uint32_t groupStart = numMat; groupStart < totalMat; groupStart += numMat){
        for(uint32_t m = 0; m < numMat; ++m){
            matBlock[groupStart+m] = matBlock[m];
        }
    }
}
|
||||
|
||||
/// Fills the first numVec entries of vecBlock with values drawn from rngdist,
/// then copies that group of numVec vectors into every following group, so
/// that each of the numConcurrentThreads groups holds identical data.
/// Every vector written must already hold randomMatSize elements.
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
    // Reference group: random contents.
    for(uint32_t v = 0; v < numVec; ++v){
        std::vector<double>& target = vecBlock[v];
        const uint32_t elemCount = static_cast<uint32_t>(randomMatSize);
        for(uint32_t e = 0; e < elemCount; ++e){
            target[e] = rngdist(PRNG);
        }
    }

    // Remaining groups: element-wise copies of the reference group.
    const uint32_t totalVec = numConcurrentThreads*numVec;
    for(uint32_t groupStart = numVec; groupStart < totalVec; groupStart += numVec){
        for(uint32_t v = 0; v < numVec; ++v){
            vecBlock[groupStart+v] = vecBlock[v];
        }
    }
}
|
||||
|
||||
/// Creates a 64-bit Mersenne Twister seeded from std::random_device and
/// advances it by ten million draws before handing it to the caller.
std::mt19937_64 InitPRNG(){
    std::random_device entropySource;
    std::mt19937_64 engine(entropySource()); // seeded from the OS-provided random source
    std::uniform_real_distribution<double> warmupDist{-1.0, 1.0};

    // Discard the first 10M outputs: generators can show poor uniformity and
    // other statistical weaknesses until their internal state is well mixed.
    uint32_t remaining = 10000000;
    while (remaining != 0){
        warmupDist(engine);
        --remaining;
    }
    return engine;
}
|
||||
|
||||
/// Debug helper: writes all numConcurrentThreads*numMat matrices to std::cout.
/// For each matrix its index is printed on its own line, followed by
/// randomMatSize lines of randomMatSize space-separated values and a blank line.
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t totalMat = numConcurrentThreads*numMat;
    const uint32_t dim = static_cast<uint32_t>(randomMatSize);

    for (uint32_t m = 0; m < totalMat; ++m){
        std::cout<<m<<std::endl;
        const std::vector<double>& mat = matBlock[m];
        for (uint32_t outer = 0; outer < dim; ++outer){
            for (uint32_t inner = 0; inner < dim; ++inner){
                std::cout<<mat[outer*randomMatSize + inner]<<" ";
            }
            std::cout<<std::endl;
        }
        std::cout<<std::endl;
    }
}
|
||||
92
cpp_thread_test/dgemm_thread_safety.cpp
Normal file
92
cpp_thread_test/dgemm_thread_safety.cpp
Normal file
@@ -0,0 +1,92 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <omp.h>
|
||||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
if(argc == 4){
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs[0]);
|
||||
numConcurrentThreads = std::stoul(cliArgs[1]);
|
||||
numTestRounds = std::stoul(cliArgs[2]);
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
|
||||
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"| DGEMM thread safety tester |\n";
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
|
||||
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
|
||||
matBlock[i].resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//pauser();
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Testing CBLAS DGEMM thread safety\n";
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
std::cout<<"DGEMM round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
|
||||
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i].get();
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
|
||||
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
101
cpp_thread_test/dgemv_thread_safety.cpp
Normal file
101
cpp_thread_test/dgemv_thread_safety.cpp
Normal file
@@ -0,0 +1,101 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <omp.h>
|
||||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
|
||||
const blasint inc = 1;
|
||||
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
if(argc == 4){
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs.at(0));
|
||||
numConcurrentThreads = std::stoul(cliArgs.at(1));
|
||||
numTestRounds = std::stoul(cliArgs.at(2));
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
|
||||
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
|
||||
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"| DGEMV thread safety tester |\n";
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
|
||||
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
matBlock.at(i).resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Allocating vectors..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
|
||||
vecBlock.at(i).resize(randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//pauser();
|
||||
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Filling vectors with random numbers..."<<std::flush;
|
||||
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
std::cout<<"DGEMV round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i].get();
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
|
||||
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
@@ -94,7 +94,7 @@ int get_feature(char *search)
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
while( (t = strtok(NULL," ")))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
}
|
||||
@@ -206,6 +206,33 @@ void get_subdirname(void)
|
||||
printf("arm64");
|
||||
}
|
||||
|
||||
/*
 * Print "#define NUM_CORES <n>" on stdout, where <n> is the number of lines
 * in /proc/cpuinfo that start with "processor".
 *
 * Only active when the `linux` macro is defined; on other builds the function
 * prints nothing. Nothing is printed either when /proc/cpuinfo cannot be
 * opened, since the count is then unknown.
 */
void get_cpucount(void)
{
#ifdef linux
	int n = 0;
	FILE *infile;
	char buffer[2048];

	infile = fopen("/proc/cpuinfo", "r");
	/* fopen returns NULL on failure; reading from it would crash. */
	if (infile == NULL) return;

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("processor", buffer, 9))
			n++;
	}

	fclose(infile);

	printf("#define NUM_CORES %d\n", n);
#endif
}
|
||||
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
@@ -309,6 +336,7 @@ void get_cpuconfig(void)
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
||||
|
||||
@@ -344,12 +372,10 @@ void get_features(void)
|
||||
if( p == NULL ) return;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
while( (t = strtok(NULL," ")))
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
77
cpuid_x86.c
77
cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
|
||||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_CORE2;
|
||||
#else
|
||||
return CPUTYPE_PENTIUM2;
|
||||
#endif
|
||||
case 7:
|
||||
case 8:
|
||||
case 10:
|
||||
@@ -1211,7 +1215,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_CORE2;
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
case 1: // family 6 exmodel 1
|
||||
switch (model) {
|
||||
case 6:
|
||||
return CPUTYPE_CORE2;
|
||||
@@ -1228,7 +1232,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_DUNNINGTON;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case 2: // family 6 exmodel 2
|
||||
switch (model) {
|
||||
case 5:
|
||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||
@@ -1257,7 +1261,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
case 3: // family 6 exmodel 3
|
||||
switch (model) {
|
||||
case 7:
|
||||
// Bay Trail
|
||||
@@ -1287,7 +1291,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
case 4: // family 6 exmodel 4
|
||||
switch (model) {
|
||||
case 5:
|
||||
case 6:
|
||||
@@ -1321,7 +1325,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
case 5: // family 6 exmodel 5
|
||||
switch (model) {
|
||||
case 6:
|
||||
//Broadwell
|
||||
@@ -1364,7 +1368,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
case 6: // family 6 exmodel 6
|
||||
switch (model) {
|
||||
case 6: // Cannon Lake
|
||||
if(support_avx512())
|
||||
@@ -1376,7 +1380,22 @@ int get_cpuname(void){
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
switch (model) {
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Ice Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
@@ -1412,7 +1431,11 @@ int get_cpuname(void){
|
||||
case 0x5:
|
||||
return CPUTYPE_AMDK6;
|
||||
case 0x6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_BARCELONA;
|
||||
#else
|
||||
return CPUTYPE_ATHLON;
|
||||
#endif
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 0:
|
||||
@@ -1795,7 +1818,11 @@ int get_coretype(void){
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CORE_CORE2;
|
||||
#else
|
||||
return CORE_P6;
|
||||
#endif
|
||||
case 7:
|
||||
return CORE_KATMAI;
|
||||
case 8:
|
||||
@@ -1979,6 +2006,38 @@ int get_coretype(void){
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
if (model == 6)
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 7:
|
||||
if (model == 10)
|
||||
return CORE_NEHALEM;
|
||||
if (model == 14)
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14) { // Kaby Lake
|
||||
@@ -2002,7 +2061,11 @@ int get_coretype(void){
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (family <= 0x5) return CORE_80486;
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
if (family <= 0xe) return CORE_BARCELONA;
|
||||
#else
|
||||
if (family <= 0xe) return CORE_ATHLON;
|
||||
#endif
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
|
||||
@@ -30,17 +30,20 @@
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14"
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14"
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
@@ -66,6 +69,8 @@ int detect(void)
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
@@ -6,6 +6,8 @@ TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
LIB = $(TOPDIR)/$(LIBNAME)
|
||||
|
||||
|
||||
@@ -577,7 +577,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -1503,6 +1503,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -653,7 +653,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -653,7 +653,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -577,7 +577,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -1504,6 +1504,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
6 NUMBER OF VALUES OF N
|
||||
7 NUMBER OF VALUES OF N
|
||||
1 2 3 5 7 9 35 VALUES OF N
|
||||
3 NUMBER OF VALUES OF ALPHA
|
||||
0.0 1.0 0.7 VALUES OF ALPHA
|
||||
|
||||
@@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
6 NUMBER OF VALUES OF N
|
||||
7 NUMBER OF VALUES OF N
|
||||
0 1 2 3 5 9 35 VALUES OF N
|
||||
3 NUMBER OF VALUES OF ALPHA
|
||||
0.0 1.0 0.7 VALUES OF ALPHA
|
||||
|
||||
@@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
||||
@@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
#else
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -104,7 +104,7 @@ typedef struct {
|
||||
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
|
||||
GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
|
||||
BETA[0], BETA[1], NULL, 0, NULL, 0, \
|
||||
(FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
|
||||
(FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
|
||||
#endif
|
||||
|
||||
#ifndef ICOPYB_OPERATION
|
||||
@@ -408,13 +408,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
}
|
||||
WMB;
|
||||
}
|
||||
|
||||
current = mypos;
|
||||
|
||||
@@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
@@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
@@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,13 +544,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
@@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
@@ -677,13 +681,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
@@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
}
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
}
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
||||
@@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
#endif
|
||||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
@@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
newarg.m = args -> m;
|
||||
newarg.n = args -> n;
|
||||
newarg.k = args -> k;
|
||||
@@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
/* Split local region of B into parts */
|
||||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, js + div_n) - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
/* Copy part of local region of B into workspace */
|
||||
START_RPCC();
|
||||
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
||||
|
||||
@@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
@@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
@@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||
@@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||
|
||||
@@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||
min_jj = ls - js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
|
||||
#else
|
||||
@@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
|
||||
#else
|
||||
@@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
#else
|
||||
@@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
|
||||
#else
|
||||
@@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||
min_jj = js - ls - min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
|
||||
sb + min_l * (min_l + jjs) * COMPSIZE);
|
||||
@@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
#else
|
||||
|
||||
@@ -21,9 +21,13 @@ else
|
||||
ifeq ($(ARCH),power)
|
||||
COMMONOBJS += dynamic_power.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),zarch)
|
||||
COMMONOBJS += dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
@@ -85,9 +89,13 @@ else
|
||||
ifeq ($(ARCH),power)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),zarch)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
||||
@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
|
||||
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
|
||||
/* jobs is queued. */
|
||||
|
||||
/* We need this grobal for cheking if initialization is finished. */
|
||||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||
|
||||
/* Local Variables */
|
||||
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
|
||||
|
||||
#ifdef MONITOR
|
||||
|
||||
/* Monitor is a function to see thread's status for every seconds. */
|
||||
/* Usually it turns off and it's for debugging. */
|
||||
/* Monitor is a function to see thread's status for every second. */
|
||||
/* Usually it turns off and it's for debugging. */
|
||||
|
||||
static pthread_t monitor_thread;
|
||||
static int main_status[MAX_CPU_NUMBER];
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
|
||||
/* This is a thread implementation for Win32 lazy implementation */
|
||||
|
||||
/* Thread server common infomation */
|
||||
/* Thread server common information */
|
||||
typedef struct{
|
||||
CRITICAL_SECTION lock;
|
||||
HANDLE filled;
|
||||
@@ -61,7 +61,7 @@ typedef struct{
|
||||
|
||||
} blas_pool_t;
|
||||
|
||||
/* We need this global for cheking if initialization is finished. */
|
||||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail = 0;
|
||||
|
||||
/* Local Variables */
|
||||
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -329,7 +329,7 @@ int support_avx512(){
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
@@ -585,9 +585,29 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont Plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
|
||||
@@ -37,17 +37,24 @@
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
#define NUM_CORETYPES 9
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
@@ -63,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
static char *corename[] = {
|
||||
"armv8",
|
||||
"cortexa53",
|
||||
"cortexa57",
|
||||
"cortexa72",
|
||||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"tsv110",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
@@ -94,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_ARMV8);
|
||||
case 1: return (&gotoblas_CORTEXA57);
|
||||
case 2: return (&gotoblas_THUNDERX);
|
||||
case 3: return (&gotoblas_THUNDERX2T99);
|
||||
case 1: return (&gotoblas_CORTEXA53);
|
||||
case 2: return (&gotoblas_CORTEXA57);
|
||||
case 3: return (&gotoblas_CORTEXA72);
|
||||
case 4: return (&gotoblas_CORTEXA73);
|
||||
case 5: return (&gotoblas_FALKOR);
|
||||
case 6: return (&gotoblas_THUNDERX);
|
||||
case 7: return (&gotoblas_THUNDERX2T99);
|
||||
case 8: return (&gotoblas_TSV110);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
@@ -105,13 +127,17 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
@@ -130,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
|
||||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
case 0xd07: // Cortex A57
|
||||
case 0xd08: // Cortex A72
|
||||
case 0xd03: // Cortex A53
|
||||
return &gotoblas_CORTEXA53;
|
||||
case 0xd07: // Cortex A57
|
||||
return &gotoblas_CORTEXA57;
|
||||
case 0xd08: // Cortex A72
|
||||
return &gotoblas_CORTEXA72;
|
||||
case 0xd09: // Cortex A73
|
||||
return &gotoblas_CORTEXA73;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
@@ -152,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
case 0x48: // HiSilicon
|
||||
switch (part)
|
||||
{
|
||||
case 0xd01: // tsv110
|
||||
return &gotoblas_TSV110;
|
||||
}
|
||||
break;
|
||||
case 0x51: // Qualcomm
|
||||
switch (part)
|
||||
{
|
||||
case 0xc00: // Falkor
|
||||
return &gotoblas_FALKOR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
@@ -19,7 +21,9 @@ static char *corename[] = {
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
@@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
||||
131
driver/others/dynamic_zarch.c
Normal file
131
driver/others/dynamic_zarch.c
Normal file
@@ -0,0 +1,131 @@
|
||||
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_Z13;
|
||||
extern gotoblas_t gotoblas_Z14;
|
||||
//extern gotoblas_t gotoblas_Z15;
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
//extern gotoblas_t gotoblas_Z14;
|
||||
//#endif
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
extern void openblas_warning(int verbose, const char* msg);
|
||||
|
||||
/* Core names, indexed by core id.  force_coretype() matches the
 * OPENBLAS_CORETYPE value against these entries and gotoblas_corename()
 * returns them.  The table must hold exactly NUM_CORETYPES entries. */
static char* corename[] = {
  "unknown",        /* 0: no table identified */
  "Z13",            /* 1 */
  "Z14",            /* 2 */
  /* "Z15", */
  "ZARCH_GENERIC",  /* 3 */
};
|
||||
|
||||
char* gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||
// if (gotoblas == &gotoblas_Z15) return corename[3];
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
//#endif
|
||||
return corename[0]; // try generic?
|
||||
}
|
||||
|
||||
// __builtin_cpu_is is not supported by zarch
|
||||
static gotoblas_t* get_coretype(void) {
|
||||
FILE* infile;
|
||||
char buffer[512], * p;
|
||||
|
||||
p = (char*)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
||||
if (!strncmp("Type", buffer, 4)) {
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return &gotoblas_Z13;
|
||||
if (strstr(p, "2965")) return &gotoblas_Z13;
|
||||
if (strstr(p, "3906")) return &gotoblas_Z14;
|
||||
if (strstr(p, "3907")) return &gotoblas_Z14;
|
||||
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
|
||||
return NULL; // should be ZARCH_GENERIC
|
||||
}
|
||||
|
||||
static gotoblas_t* force_coretype(char* coretype) {
|
||||
|
||||
int i;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for (i = 0; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 1: return (&gotoblas_Z13);
|
||||
case 2: return (&gotoblas_Z14);
|
||||
// case 3: return (&gotoblas_Z15);
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// case 3: return (&gotoblas_POWER9);
|
||||
//#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char* p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if (p)
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to Z14 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_Z14;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas->init();
|
||||
}
|
||||
else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
|
||||
|
||||
int mynode = 1;
|
||||
|
||||
/* if number of threads is larger than inital condition */
|
||||
/* if number of threads is larger than initial condition */
|
||||
if (pos < 0) {
|
||||
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
|
||||
return 0;
|
||||
@@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
|
||||
common -> shmid = pshmid;
|
||||
|
||||
if (common -> magic != SH_MAGIC) {
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if __GLIBC_PREREQ(2, 7)
|
||||
cpu_set_t *cpusetp;
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
#endif
|
||||
#endif
|
||||
int nums;
|
||||
int ret;
|
||||
|
||||
@@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
|
||||
}
|
||||
CPU_FREE(cpusetp);
|
||||
#else
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
|
||||
if (ret!=0) {
|
||||
common->num_procs = nums;
|
||||
} else {
|
||||
@@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
|
||||
int i;
|
||||
int n = 0;
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
common->num_procs = n;
|
||||
}
|
||||
#else
|
||||
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
common->num_procs = CPU_COUNT(&cpuset);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -229,7 +229,7 @@ int get_num_procs(void) {
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpuset)) n++;
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
@@ -312,7 +312,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -822,7 +822,7 @@ static void *alloc_qalloc(void *address){
|
||||
|
||||
static void alloc_windows_free(struct alloc_t *alloc_info){
|
||||
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, 0, MEM_RELEASE);
|
||||
|
||||
}
|
||||
|
||||
@@ -935,7 +935,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
gotoblas_init();
|
||||
gotoblas_quit();
|
||||
|
||||
#if __PGIC__ < 19
|
||||
#if 0
|
||||
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
||||
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
||||
@@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
||||
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1671,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -1734,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -1772,7 +1774,7 @@ int get_num_procs(void) {
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpuset)) n++;
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
@@ -1853,7 +1855,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -1943,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1951,7 +1953,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -1975,7 +1977,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
|
||||
|
||||
static void alloc_mmap_free(struct release_t *release){
|
||||
|
||||
if (!release->address) return;
|
||||
|
||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
printf("OpenBLAS : munmap failed\n");
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : munmap failed:");
|
||||
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){
|
||||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
} else {
|
||||
#ifdef DEBUG
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : mmap failed:");
|
||||
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
@@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){
|
||||
#endif
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
@@ -2298,7 +2310,7 @@ static void *alloc_qalloc(void *address){
|
||||
|
||||
static void alloc_windows_free(struct release_t *release){
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
|
||||
VirtualFree(release -> address, 0, MEM_RELEASE);
|
||||
|
||||
}
|
||||
|
||||
@@ -2420,7 +2432,7 @@ static void alloc_hugetlb_free(struct release_t *release){
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
position = 0;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
do {
|
||||
@@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){
|
||||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
goto error;
|
||||
@@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){
|
||||
#endif
|
||||
|
||||
memory[position].used = 1;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
@@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
#ifdef ALLOC_DEVICEDRIVER
|
||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
memory[position].addr = map_address;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
@@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){
|
||||
#endif
|
||||
|
||||
position = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
@@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){
|
||||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
@@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){
|
||||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
return;
|
||||
@@ -2924,7 +2936,7 @@ void blas_shutdown(void){
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||
@@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||
if (hot_alloc != 2) {
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
LOCK_COMMAND(&init_lock);
|
||||
#endif
|
||||
|
||||
@@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||
size -= PAGESIZE;
|
||||
}
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
UNLOCK_COMMAND(&init_lock);
|
||||
#endif
|
||||
|
||||
@@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
|
||||
gotoblas_init();
|
||||
gotoblas_quit();
|
||||
|
||||
#if __PGIC__ < 19
|
||||
#if 0
|
||||
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
||||
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
||||
@@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
||||
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -38,21 +38,29 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Dummy functions for builds without SMP: a single processor is reported
 * and thread-count requests are ignored. */
int goto_get_num_procs(void) { return 1; }

void goto_set_num_threads(int num_threads) { (void)num_threads; }
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
/* Processor count for Linux/SunOS builds compiled with NO_AFFINITY.
 *
 * The configured-processor count from sysconf() is cached in a static.
 * When OS_LINUX is not defined that count is returned.  On Linux the
 * affinity-mask refinement below is commented out and the function
 * returns 1. */
int get_num_procs(void) {

  static int nums = 0;
  /* Only referenced by the disabled code below. */
  cpu_set_t cpuset,*cpusetp;
  size_t size;
  int ret;

#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
  int i;
#if !__GLIBC_PREREQ(2, 6)
  int n;
#endif
#endif
#endif

  if (nums == 0) {
    nums = sysconf(_SC_NPROCESSORS_CONF);
  }
#if !defined(OS_LINUX)
  return nums;
#endif

/*
#if !defined(__GLIBC_PREREQ)
  return nums;
#else
 #if !__GLIBC_PREREQ(2, 3)
  return nums;
 #endif

 #if !__GLIBC_PREREQ(2, 7)
  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
  if (ret!=0) return nums;
  n=0;
#if !__GLIBC_PREREQ(2, 6)
  for (i=0;i<nums;i++)
     if (CPU_ISSET(i,&cpuset)) n++;
  nums=n;
#else
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
  return nums;
 #else
  if (nums >= CPU_SETSIZE) {
    cpusetp = CPU_ALLOC(nums);
      if (cpusetp == NULL) {
        return nums;
      }
    size = CPU_ALLOC_SIZE(nums);
    ret = sched_getaffinity(0,size,cpusetp);
    if (ret!=0) {
      CPU_FREE(cpusetp);
      return nums;
    }
    ret = CPU_COUNT_S(size,cpusetp);
    if (ret > 0 && ret < nums) nums = ret;
    CPU_FREE(cpusetp);
    return nums;
  } else {
    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
    if (ret!=0) {
      return nums;
    }
    ret = CPU_COUNT(&cpuset);
    if (ret > 0 && ret < nums) nums = ret;
    return nums;
  }
 #endif
#endif
*/
  return 1;
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
/* Android: configured-processor count from sysconf(), looked up on the
 * first call (and again whenever the cached value is still 0). */
int get_num_procs(void) {
  static int cached = 0;

  if (cached == 0) {
    cached = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
/* Haiku: configured-processor count from sysconf(), looked up on the
 * first call (and again whenever the cached value is still 0). */
int get_num_procs(void) {
  static int cached = 0;

  if (cached == 0) {
    cached = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/* AIX: configured-processor count from sysconf(), looked up on the
 * first call (and again whenever the cached value is still 0). */
int get_num_procs(void) {
  static int cached = 0;

  if (cached == 0) {
    cached = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int m[2];
|
||||
size_t len;
|
||||
|
||||
if (nums == 0) {
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &nums, &len, NULL, 0);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
/* Darwin: processor count from the "hw.physicalcpu" sysctl.
 *
 * A successful, positive result is cached in a static.  If the query fails
 * or yields a non-positive value, 1 is returned without caching, so the
 * next call tries again.  Returning 0 here would let callers that cap the
 * thread count at this value end up with zero threads. */
int get_num_procs(void) {
  static int nums = 0;
  size_t len;

  if (nums == 0){
    int count = 0;

    len = sizeof(count);
    if (sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0) != 0 || count < 1) {
      return 1;
    }
    nums = count;
  }
  return nums;
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the number of CPU cores for multithreading.
|
||||
It can be set by openblas_set_num_threads(int num_threads);
|
||||
*/
|
||||
int blas_cpu_number = 0;
|
||||
/*
|
||||
The number of threads in the thread pool.
|
||||
This value is equal to or larger than blas_cpu_number, which means some threads may be asleep.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
||||
/* Register blas_thread_shutdown as the pthread_atfork "prepare" handler.
 *
 * Compiled to a no-op unless SMP_SERVER is defined, and also on Android and
 * on Windows other than Cygwin.  A failed registration only produces a
 * warning. */
void openblas_fork_handler()
{
  // The handler shuts down the PTHREAD pool that OpenBLAS manages itself,
  // i.e. when OpenBLAS is built with "make USE_OPENMP=0".
  // A hang is still possible when OpenBLAS is built against the libgomp
  // implementation of OpenMP; that problem is tracked at:
  //       http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  // Until it is resolved, build with USE_OPENMP=0 or link against another
  // implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL) != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Processor count: get_num_procs() in SMP builds, otherwise always 1. */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
|
||||
|
||||
/* Thread count: blas_cpu_number in SMP builds, otherwise always 1. */
int openblas_get_num_threads(void) {
#ifdef SMP
  /* Called for its side effect: it fills in blas_cpu_number when that has
   * not happened yet. */
  (void)blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
|
||||
|
||||
@@ -78,10 +78,10 @@ char tmpstr[20];
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
else
|
||||
@@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
|
||||
@@ -50,7 +50,10 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
|
||||
gotoblas_init();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
gotoblas_quit();
|
||||
// If the process is about to exit, don't bother releasing any resources
|
||||
// The kernel is much better at bulk releasing then.
|
||||
if (!reserved)
|
||||
gotoblas_quit();
|
||||
break;
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
|
||||
@@ -618,19 +618,6 @@
|
||||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
@@ -647,33 +634,8 @@
|
||||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
@@ -690,45 +652,8 @@
|
||||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
@@ -745,45 +670,8 @@
|
||||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
@@ -800,18 +688,6 @@
|
||||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
@@ -819,24 +695,18 @@
|
||||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
ssysv_aa_2stage,
|
||||
ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage,
|
||||
chesv_aa_2stage,
|
||||
chetrf_aa_2stage,
|
||||
chetrs_aa_2stage,
|
||||
csysv_aa_2stage,
|
||||
csytrf_aa_2stage,
|
||||
csytrs_aa_2stage,
|
||||
dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage,
|
||||
dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage,
|
||||
zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage,
|
||||
zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage,
|
||||
zsytrs_aa_2stage
|
||||
|
||||
# functions added for lapack-3.9.0
|
||||
cgesvdq,
|
||||
cungtsqr,
|
||||
dcombssq,
|
||||
dgesvdq,
|
||||
dorgtsqr,
|
||||
scombssq,
|
||||
sgesvdq,
|
||||
sorgtsqr,
|
||||
zgesvdq,
|
||||
zungtsqr
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
@@ -3489,6 +3359,15 @@
|
||||
LAPACKE_zsytrf_aa_2stage_work,
|
||||
LAPACKE_zsytrs_aa_2stage,
|
||||
LAPACKE_zsytrs_aa_2stage_work,
|
||||
|
||||
# new functions from 3.9.0
|
||||
LAPACKE_dgesvdq,
|
||||
LAPACKE_dgesvdq_work,
|
||||
LAPACKE_sgesvdq,
|
||||
LAPACKE_sgesvdq_work,
|
||||
LAPACKE_zgesvdq,
|
||||
LAPACKE_zgesvdq_work
|
||||
|
||||
);
|
||||
|
||||
#These functions may need 2 underscores.
|
||||
@@ -3509,6 +3388,65 @@
|
||||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
# 3.7.0
|
||||
slasyf_rk, ssyconvf_rook, ssytf2_rk,
|
||||
ssytrf_rk, ssytrs_3, ssytri_3,
|
||||
ssytri_3x, ssycon_3, ssysv_rk,
|
||||
slasyf_aa, ssysv_aa, ssytrf_aa,
|
||||
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
|
||||
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
|
||||
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
|
||||
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
|
||||
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
|
||||
dsytf2_rk, dsytrf_rk, dsytrs_3,
|
||||
dsytri_3, dsytri_3x, dsycon_3,
|
||||
dsysv_rk, dlasyf_aa, dsysv_aa,
|
||||
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
|
||||
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
|
||||
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
|
||||
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
|
||||
dsbevd_2stage, dsygv_2stage, chetf2_rk,
|
||||
chetrf_rk, chetri_3, chetri_3x,
|
||||
chetrs_3, checon_3, chesv_rk,
|
||||
chesv_aa, chetrf_aa, chetrs_aa,
|
||||
clahef_aa, clahef_rk, clasyf_rk,
|
||||
clasyf_aa, csytf2_rk, csytrf_rk,
|
||||
csytrf_aa, csytrs_3, csytrs_aa,
|
||||
csytri_3, csytri_3x, csycon_3,
|
||||
csysv_rk, csysv_aa, csyconvf_rook,
|
||||
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
|
||||
chb2st_kernels, cheevd_2stage, cheev_2stage,
|
||||
cheevx_2stage, cheevr_2stage, chbev_2stage,
|
||||
chbevx_2stage, chbevd_2stage, chegv_2stage,
|
||||
zhetf2_rk, zhetrf_rk, zhetri_3,
|
||||
zhetri_3x, zhetrs_3, zhecon_3,
|
||||
zhesv_rk, zhesv_aa, zhetrf_aa,
|
||||
zhetrs_aa, zlahef_aa, zlahef_rk,
|
||||
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
|
||||
zsytrs_aa, zsytf2_rk, zsytrf_rk,
|
||||
zsytrf_aa, zsytrs_3, zsytri_3,
|
||||
zsytri_3x, zsycon_3, zsysv_rk,
|
||||
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
|
||||
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
|
||||
zheev_2stage, zheevx_2stage, zheevr_2stage,
|
||||
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
# 3.8.0
|
||||
ssysv_aa_2stage, ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage, chesv_aa_2stage,
|
||||
chetrf_aa_2stage, chetrs_aa_2stage,
|
||||
csysv_aa_2stage, csytrf_aa_2stage,
|
||||
csytrs_aa_2stage, dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage, dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage, zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage, zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage, zsytrs_aa_2stage,
|
||||
# 3.9.0
|
||||
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
|
||||
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
|
||||
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
|
||||
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
|
||||
|
||||
);
|
||||
|
||||
|
||||
|
||||
14
f_check
14
f_check
@@ -19,7 +19,7 @@ $nofortran = 0;
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
@@ -71,7 +71,7 @@ if ($compiler eq "") {
|
||||
|
||||
if ($data =~ /GNU/) {
|
||||
|
||||
$data =~ /(\d)\.(\d).(\d)/;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
|
||||
@@ -125,11 +125,16 @@ if ($compiler eq "") {
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
@@ -277,6 +282,8 @@ $linker_a = "";
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
|
||||
@@ -327,6 +334,7 @@ if ($link ne "") {
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /^\-l$/)
|
||||
) {
|
||||
$linker_l .= $flags . " ";
|
||||
|
||||
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
|
||||
@@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
|
||||
axpby.c
|
||||
)
|
||||
|
||||
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
|
||||
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
|
||||
# these all have 'z' sources for complex versions
|
||||
set(BLAS2_SOURCES
|
||||
gemv.c ger.c
|
||||
|
||||
@@ -394,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
||||
SLAPACKOBJS = \
|
||||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
|
||||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#DLAPACKOBJS = \
|
||||
@@ -405,14 +405,14 @@ SLAPACKOBJS = \
|
||||
DLAPACKOBJS = \
|
||||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
|
||||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
QLAPACKOBJS = \
|
||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
||||
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
||||
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
|
||||
qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
qtrtrs.$(SUFFIX)
|
||||
|
||||
#CLAPACKOBJS = \
|
||||
# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
@@ -423,7 +423,7 @@ QLAPACKOBJS = \
|
||||
CLAPACKOBJS = \
|
||||
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#ZLAPACKOBJS = \
|
||||
@@ -435,13 +435,14 @@ CLAPACKOBJS = \
|
||||
ZLAPACKOBJS = \
|
||||
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
|
||||
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
XLAPACKOBJS = \
|
||||
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
||||
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xtrtrs.$(SUFFIX)
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
SBLASOBJS += $(SLAPACKOBJS)
|
||||
@@ -2031,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
@@ -2040,7 +2041,25 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c
|
||||
|
||||
@@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
//
|
||||
//Temporarily work-around the low performance issue with small imput size &
|
||||
//Temporarily work-around the low performance issue with small input size &
|
||||
//multithreads.
|
||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||
nthreads = 1;
|
||||
|
||||
@@ -44,19 +44,19 @@
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGESV "
|
||||
#define ERROR_NAME "QGESV"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGESV "
|
||||
#define ERROR_NAME "DGESV"
|
||||
#else
|
||||
#define ERROR_NAME "SGESV "
|
||||
#define ERROR_NAME "SGESV"
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGESV "
|
||||
#define ERROR_NAME "XGESV"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZGESV "
|
||||
#define ERROR_NAME "ZGESV"
|
||||
#else
|
||||
#define ERROR_NAME "CGESV "
|
||||
#define ERROR_NAME "CGESV"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -89,7 +89,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
|
||||
if (args.m < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
||||
if (trans < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (uplo < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
171
interface/lapack/trtrs.c
Normal file
171
interface/lapack/trtrs.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "STRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Single-threaded TRTRS kernels.  The table is indexed with
 * (uplo << 2) | (trans << 1) | diag, where uplo is 0 for 'U' / 1 for 'L',
 * trans is 0 for 'N'/'R' / 1 for 'T'/'C', and diag is 0 for 'U' (unit) /
 * 1 for 'N' (non-unit).
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_SINGLE,
  TRTRS_UNN_SINGLE,
  TRTRS_UTU_SINGLE,
  TRTRS_UTN_SINGLE,
  TRTRS_LNU_SINGLE,
  TRTRS_LNN_SINGLE,
  TRTRS_LTU_SINGLE,
  TRTRS_LTN_SINGLE,
};
|
||||
|
||||
#ifdef SMP
|
||||
/*
 * Multi-threaded TRTRS kernels, used when more than one thread is available.
 * Indexed with (uplo << 2) | (trans << 1) | diag, where uplo is 0 for 'U' /
 * 1 for 'L', trans is 0 for 'N'/'R' / 1 for 'T'/'C', and diag is 0 for 'U'
 * (unit) / 1 for 'N' (non-unit).
 */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_PARALLEL,
  TRTRS_UNN_PARALLEL,
  TRTRS_UTU_PARALLEL,
  TRTRS_UTN_PARALLEL,
  TRTRS_LNU_PARALLEL,
  TRTRS_LNN_PARALLEL,
  TRTRS_LTU_PARALLEL,
  TRTRS_LTN_PARALLEL,
};
|
||||
#endif
|
||||
|
||||
/*
 * Fortran-style ?TRTRS entry point (real precisions; the reported routine
 * name is selected by ERROR_NAME).
 *
 * Solves a triangular system with the N-by-N matrix `a` (leading dimension
 * *ldA) and the N-by-NRHS right-hand sides in `b` (leading dimension *ldB),
 * dispatching to one of the TRTRS_* kernels chosen by UPLO / TRANS / DIAG.
 *
 * Arguments (all passed by reference):
 *   UPLO   'U' or 'L' (case-insensitive).
 *   TRANS  'N', 'T', 'R' or 'C' (case-insensitive); 'R' is treated like 'N'
 *          and 'C' like 'T'.
 *   DIAG   'U' (unit diagonal) or 'N' (non-unit), case-insensitive.
 *   N, NRHS, a, ldA, b, ldB  problem sizes and operands.
 *   Info   out: 0 on success; -i if argument i was invalid (xerbla is
 *          called first); for a non-unit diagonal containing an exact zero,
 *          the value returned by IAMIN_K over the diagonal (presumably the
 *          1-based position of that zero -- confirm against IAMIN_K).
 *
 * Always returns 0; status is reported through *Info only.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	 FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m    = *N;
  args.n    = *NRHS;
  args.a    = (void *)a;
  args.lda  = *ldA;
  args.b    = (void *)b;
  args.ldb  = *ldB;

  info = 0;

  /* Option characters are case-insensitive: normalise all three, not just
     TRANS, so that lowercase 'u'/'l'/'n' are accepted as well. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
  if (trans_arg == 'C') trans = 1;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /* Checks run from the highest argument position down to the lowest so
     that the last assignment -- i.e. the first invalid argument in the
     parameter list -- is the one that gets reported. */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0) info = 5;
  if (args.m < 0) info = 4;
  if (diag  < 0) info = 3;
  if (trans < 0) info = 2;
  if (uplo  < 0) info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Nothing to solve for an empty matrix. */
  if (args.m == 0) return 0;

  /* Non-unit diagonal: walk the diagonal (stride lda + 1) and bail out
     before doing any work if its smallest magnitude is exactly zero. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  /* Carve the two kernel work areas out of a single allocated buffer. */
  buffer = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
||||
if (trans < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (uplo < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user