Compare commits
777 Commits
revert-179
...
revert-232
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83dae28ae2 | ||
|
|
da986d2e83 | ||
|
|
6bc487de35 | ||
|
|
f95989cbc1 | ||
|
|
04226f1e97 | ||
|
|
0925ef70db | ||
|
|
371e6f73d4 | ||
|
|
d117dfd505 | ||
|
|
883c39773a | ||
|
|
b09b5be0a4 | ||
|
|
bfb5fbdb4d | ||
|
|
3da6d66da9 | ||
|
|
08fa83aba2 | ||
|
|
63d3ee8dfc | ||
|
|
1191db1a49 | ||
|
|
1f6071590d | ||
|
|
0caf1434c9 | ||
|
|
73128f3883 | ||
|
|
cad0d150db | ||
|
|
eba0aeb7cd | ||
|
|
0c07c356c1 | ||
|
|
82b75f97e5 | ||
|
|
7887c45077 | ||
|
|
3e67017ac8 | ||
|
|
b3ac6ee222 | ||
|
|
6082e556cd | ||
|
|
92315173d5 | ||
|
|
351d12b94e | ||
|
|
bf73aa141b | ||
|
|
71e96163db | ||
|
|
819e852ae7 | ||
|
|
4e466d739c | ||
|
|
4c6a457358 | ||
|
|
836c414e22 | ||
|
|
d403eb3c2f | ||
|
|
3cd97f1a80 | ||
|
|
9955f0996f | ||
|
|
430c11e135 | ||
|
|
fbacd2605d | ||
|
|
6fa89b06a1 | ||
|
|
68597002ea | ||
|
|
d2a6285549 | ||
|
|
d999688d1a | ||
|
|
928fe1b28e | ||
|
|
ccc28c6d60 | ||
|
|
ae43b75a6a | ||
|
|
54fc06fd70 | ||
|
|
1df9a2013d | ||
|
|
274ff5cdb8 | ||
|
|
eb2eddf241 | ||
|
|
8691825944 | ||
|
|
7dc8a76f60 | ||
|
|
df857551c0 | ||
|
|
85ccdce8c4 | ||
|
|
aeabe0a83f | ||
|
|
1b90989662 | ||
|
|
e3e8b5cdca | ||
|
|
69b16a894d | ||
|
|
6782e5767d | ||
|
|
48f5a89f92 | ||
|
|
4ae1610f37 | ||
|
|
911c3e2f4b | ||
|
|
fab49e49e5 | ||
|
|
b687fba5bc | ||
|
|
46a8c2519a | ||
|
|
e9437eebd2 | ||
|
|
3a39062cfc | ||
|
|
eaa0be1313 | ||
|
|
6ff013bae0 | ||
|
|
0d669e04bb | ||
|
|
17cdd9f9e1 | ||
|
|
6bcb06fcb1 | ||
|
|
b7315f8401 | ||
|
|
9b19e9e1b0 | ||
|
|
6bd67ddbab | ||
|
|
5da9484d93 | ||
|
|
844629af57 | ||
|
|
2beaa82c05 | ||
|
|
e8a2aed2b9 | ||
|
|
f262031685 | ||
|
|
5f6206fa2d | ||
|
|
f2cde2ccfb | ||
|
|
ba7838d2e1 | ||
|
|
a448884a63 | ||
|
|
17609f88f1 | ||
|
|
3a2df19db6 | ||
|
|
d2093a40d3 | ||
|
|
aa04b0925e | ||
|
|
258ac56e0a | ||
|
|
56837e9d92 | ||
|
|
bb5413863f | ||
|
|
32f5907fef | ||
|
|
ac10236cc8 | ||
|
|
8617d75548 | ||
|
|
c07d78b9e9 | ||
|
|
6355c25dde | ||
|
|
5e244d80f2 | ||
|
|
ede5efebab | ||
|
|
84908d60d2 | ||
|
|
596a22325a | ||
|
|
7f58f3ad0e | ||
|
|
c0d570a357 | ||
|
|
6b83079368 | ||
|
|
673e5a0495 | ||
|
|
bfa2cc7d64 | ||
|
|
e7c4d6705a | ||
|
|
2a1911cc14 | ||
|
|
9f7a9a32e3 | ||
|
|
5d6525c87c | ||
|
|
6cb47ea3f0 | ||
|
|
459bb9291d | ||
|
|
3f1077ce6f | ||
|
|
eb45eb6942 | ||
|
|
f2becb777a | ||
|
|
5997b6b491 | ||
|
|
4b21b646ea | ||
|
|
7ec7b999a5 | ||
|
|
af9ac0898a | ||
|
|
c7b5a459b6 | ||
|
|
9b2f0323d6 | ||
|
|
9f6984fe4b | ||
|
|
42203dafdc | ||
|
|
a4f17a9297 | ||
|
|
733d97b2df | ||
|
|
ea747cf933 | ||
|
|
4de545aa7d | ||
|
|
6e9a93ec19 | ||
|
|
fde8a8e6a0 | ||
|
|
256fc15f5f | ||
|
|
ee498525e0 | ||
|
|
1fec0570f6 | ||
|
|
b5af7b9c78 | ||
|
|
f3c314550c | ||
|
|
847c20c9b7 | ||
|
|
4c22828812 | ||
|
|
e79712d969 | ||
|
|
be09551cdf | ||
|
|
ec1ef6aa9e | ||
|
|
11c59acfb1 | ||
|
|
bf0d92a310 | ||
|
|
db066151ee | ||
|
|
3a55dca2dc | ||
|
|
7d380f7d79 | ||
|
|
300f158d3b | ||
|
|
3635fdbf2b | ||
|
|
b6552b11eb | ||
|
|
5fdf9ad24f | ||
|
|
2fe967c542 | ||
|
|
6d8595351c | ||
|
|
f40200f559 | ||
|
|
a95a5e52b8 | ||
|
|
e3d846ab57 | ||
|
|
8506386d82 | ||
|
|
9ef96b32a6 | ||
|
|
b48c025974 | ||
|
|
a1fce67743 | ||
|
|
103b32fdb7 | ||
|
|
aef9804089 | ||
|
|
303869f572 | ||
|
|
02d9203981 | ||
|
|
7b6808b69c | ||
|
|
321288597c | ||
|
|
be147a9f28 | ||
|
|
c275290ea6 | ||
|
|
b7bbb02447 | ||
|
|
bf1430f7d7 | ||
|
|
dccff2e785 | ||
|
|
5c3458a6e7 | ||
|
|
1776ad82c0 | ||
|
|
4e2f81cfa1 | ||
|
|
acf6002ab2 | ||
|
|
96a794e9fd | ||
|
|
3d36c45116 | ||
|
|
648491e1aa | ||
|
|
2dfb804cb9 | ||
|
|
4c153ec9da | ||
|
|
7eecd8e39c | ||
|
|
f0406a7708 | ||
|
|
561f3fd995 | ||
|
|
30efed14d1 | ||
|
|
af2e7f28fc | ||
|
|
4250e6ed64 | ||
|
|
7b0b7c11d2 | ||
|
|
d14cf1ccf4 | ||
|
|
3f6ab1582a | ||
|
|
28e96458e5 | ||
|
|
95fb98f556 | ||
|
|
4801c6d36b | ||
|
|
9440fa607d | ||
|
|
94db259e5b | ||
|
|
f49f8047ac | ||
|
|
825777faab | ||
|
|
9c89757562 | ||
|
|
b0b7600bef | ||
|
|
9b04baeaee | ||
|
|
8a074b3965 | ||
|
|
211ab03b14 | ||
|
|
1733f927e6 | ||
|
|
182b06d6ad | ||
|
|
7a9050d681 | ||
|
|
0ba29fd262 | ||
|
|
bafa021ed6 | ||
|
|
b89d9762a2 | ||
|
|
08dedf4c5e | ||
|
|
b89c781637 | ||
|
|
dd7ff77f4b | ||
|
|
8fb76134bc | ||
|
|
04d671aae2 | ||
|
|
f69a0be712 | ||
|
|
ae9e8b131e | ||
|
|
9086543f50 | ||
|
|
abea977ded | ||
|
|
6b6c9b1441 | ||
|
|
a97b301aaa | ||
|
|
2f13f04224 | ||
|
|
7c7505a778 | ||
|
|
5a4f1a2118 | ||
|
|
3b761892df | ||
|
|
eebfeba768 | ||
|
|
7684c4f8f8 | ||
|
|
7faf42b7bb | ||
|
|
a575f1e4c7 | ||
|
|
cdbfb891da | ||
|
|
280552b988 | ||
|
|
bbd4bb0154 | ||
|
|
6d3efb2b58 | ||
|
|
d9ff2cd90d | ||
|
|
2a43062de7 | ||
|
|
4ea794a522 | ||
|
|
ece0bfb881 | ||
|
|
1f4b6a5d5d | ||
|
|
be8f70d269 | ||
|
|
e674e1c735 | ||
|
|
6ca898b63b | ||
|
|
26411acd56 | ||
|
|
0ab4076dd8 | ||
|
|
a0caa762b3 | ||
|
|
900d5a3205 | ||
|
|
a17cf36225 | ||
|
|
148c4cc5fd | ||
|
|
d0c3543c3f | ||
|
|
909ad04aef | ||
|
|
417efd41c6 | ||
|
|
9cdc828afa | ||
|
|
7a9a4dbc4f | ||
|
|
a469b32cf4 | ||
|
|
27649b9543 | ||
|
|
16f3df5d35 | ||
|
|
1aded69821 | ||
|
|
c00289ba54 | ||
|
|
8fe794f059 | ||
|
|
74c10b57c6 | ||
|
|
c5495d2056 | ||
|
|
c70496b108 | ||
|
|
ca8d8835f5 | ||
|
|
d76b20b4d2 | ||
|
|
85af04da3c | ||
|
|
11e0dcbffb | ||
|
|
79366ff7a9 | ||
|
|
21d05a4835 | ||
|
|
940f38f6dd | ||
|
|
1778fd4219 | ||
|
|
969dd6175e | ||
|
|
d8d5682481 | ||
|
|
f66c11fc22 | ||
|
|
5ecffc28f2 | ||
|
|
86dda5c2fa | ||
|
|
1e52572be3 | ||
|
|
d2cb610272 | ||
|
|
a211bc9b6a | ||
|
|
9208ab8603 | ||
|
|
b43deb4ad6 | ||
|
|
b911525c81 | ||
|
|
7ff44e0016 | ||
|
|
e3cb8ad2d6 | ||
|
|
7aa6faad5f | ||
|
|
3d94ab660f | ||
|
|
cd99dfe034 | ||
|
|
dadafcdcd8 | ||
|
|
d40c109eb0 | ||
|
|
608cd69b66 | ||
|
|
231472c4c6 | ||
|
|
612c2d78e0 | ||
|
|
dc110e179d | ||
|
|
9184590c33 | ||
|
|
a0aaf308ed | ||
|
|
15f925fe9a | ||
|
|
21acf03e9a | ||
|
|
ff807473bb | ||
|
|
58829c0988 | ||
|
|
d86f0b9e74 | ||
|
|
63554d5dec | ||
|
|
43068288e9 | ||
|
|
999a04f101 | ||
|
|
3cb1c8d210 | ||
|
|
ff1bfe7b16 | ||
|
|
9ea30f3788 | ||
|
|
a3d4c65d62 | ||
|
|
e1fc02095c | ||
|
|
0cd6d8508f | ||
|
|
c2f152c470 | ||
|
|
4efbac28ed | ||
|
|
406c7242f4 | ||
|
|
53703585aa | ||
|
|
ad20ceaa68 | ||
|
|
dd77a3f0e2 | ||
|
|
a598ab1d32 | ||
|
|
16fd8e3dbe | ||
|
|
aa4c41bad2 | ||
|
|
5cf434167a | ||
|
|
3a49e8c05a | ||
|
|
95e2cf32e1 | ||
|
|
70cea0b96b | ||
|
|
ae0dec77ec | ||
|
|
e47b63466b | ||
|
|
7d1b468d9d | ||
|
|
575a84398a | ||
|
|
5cabda79d0 | ||
|
|
c516209581 | ||
|
|
a6a8cc2b7f | ||
|
|
3d7debbb28 | ||
|
|
5a9cce2bf6 | ||
|
|
6a8b4269b5 | ||
|
|
b1561ecc68 | ||
|
|
7ed8431527 | ||
|
|
a387a23518 | ||
|
|
b46875b76b | ||
|
|
858e609e1f | ||
|
|
3f427c0cf9 | ||
|
|
c95317158f | ||
|
|
47f892198c | ||
|
|
b43c8382c8 | ||
|
|
daf2fec12d | ||
|
|
4f8143b098 | ||
|
|
bfeb9c16b0 | ||
|
|
97d5034ed3 | ||
|
|
9763f872fc | ||
|
|
628b335e83 | ||
|
|
0f105dd8a5 | ||
|
|
9c4edd38f2 | ||
|
|
1036299da0 | ||
|
|
5b0398186e | ||
|
|
452859f4e1 | ||
|
|
2cd463eabd | ||
|
|
11530b76f7 | ||
|
|
91943b7325 | ||
|
|
268c28db7d | ||
|
|
2aad88d5b9 | ||
|
|
0bd956fd21 | ||
|
|
bbd9d98664 | ||
|
|
798c448b0c | ||
|
|
9a19616a28 | ||
|
|
6b41eb9c0c | ||
|
|
ccfb7ead15 | ||
|
|
40e53e52d6 | ||
|
|
744779d335 | ||
|
|
bcdf1d4917 | ||
|
|
e06b8438b4 | ||
|
|
9229d6859b | ||
|
|
21d146a8de | ||
|
|
7f4e36d219 | ||
|
|
c04a729081 | ||
|
|
100d94f94e | ||
|
|
d17da6c6a4 | ||
|
|
1679de5e59 | ||
|
|
246ca29679 | ||
|
|
9d717cb5ee | ||
|
|
e3bc83f2a8 | ||
|
|
70f2a4e0d7 | ||
|
|
706dfe263b | ||
|
|
688fa9201c | ||
|
|
cdbe0f0235 | ||
|
|
f8b82bc6dc | ||
|
|
3e3ccb9011 | ||
|
|
94ab4e6fb2 | ||
|
|
c3cfc6986b | ||
|
|
b9f4943a14 | ||
|
|
79cfc24a62 | ||
|
|
5c42287c4f | ||
|
|
32c7063cb0 | ||
|
|
c19a449096 | ||
|
|
3d1e36d4cb | ||
|
|
4f9d3e4b28 | ||
|
|
4dec151d0b | ||
|
|
7c51cc8527 | ||
|
|
853a18bc17 | ||
|
|
3ae122e2c7 | ||
|
|
b043a5962e | ||
|
|
8502030e5e | ||
|
|
8ba9e2a61a | ||
|
|
4ad694eda1 | ||
|
|
dff4a197a5 | ||
|
|
a5425575b1 | ||
|
|
1006ff8a7b | ||
|
|
e608d4f7fe | ||
|
|
4fc17d0d75 | ||
|
|
c3e30b2bc2 | ||
|
|
03d7110900 | ||
|
|
3ce28fb81a | ||
|
|
04f2226ea6 | ||
|
|
b1393c7a97 | ||
|
|
7e3eb9b25d | ||
|
|
f074d7d146 | ||
|
|
f18ab6c17b | ||
|
|
946ec6c3b8 | ||
|
|
5b95534afc | ||
|
|
f7a06463d9 | ||
|
|
b0c714ef60 | ||
|
|
8d3d29e4d7 | ||
|
|
b7f59da42d | ||
|
|
db3dc9e282 | ||
|
|
4290afdae2 | ||
|
|
4741ce803b | ||
|
|
11cfd0bd75 | ||
|
|
651ab01d2b | ||
|
|
d7b2c53c0b | ||
|
|
e4864a8933 | ||
|
|
10d841d8b9 | ||
|
|
12f2b76748 | ||
|
|
6c83b878f6 | ||
|
|
fb4dae7124 | ||
|
|
760842dda1 | ||
|
|
53f482ee72 | ||
|
|
783ba8058f | ||
|
|
af480b02a4 | ||
|
|
e4a79be6bb | ||
|
|
e5c316c6b9 | ||
|
|
25427926bc | ||
|
|
edb8143141 | ||
|
|
c4868d11c0 | ||
|
|
4c321ae571 | ||
|
|
2ffb727187 | ||
|
|
d66214c946 | ||
|
|
fd34820b99 | ||
|
|
918a0cc4d1 | ||
|
|
0db9c03e7e | ||
|
|
6eee1beac5 | ||
|
|
e5df5958cc | ||
|
|
343b301d14 | ||
|
|
45333d5793 | ||
|
|
e29b0cfcc4 | ||
|
|
78d9910236 | ||
|
|
e12cdf58ef | ||
|
|
1860c9456d | ||
|
|
aec905498f | ||
|
|
56089991e2 | ||
|
|
f9bb76d29a | ||
|
|
8242b1fe3f | ||
|
|
efb9038f72 | ||
|
|
e976557d29 | ||
|
|
9d8be15789 | ||
|
|
d752799a0f | ||
|
|
f209fc7fa9 | ||
|
|
c26c0b77a7 | ||
|
|
1c6da2d03c | ||
|
|
4255a58cd2 | ||
|
|
d3e4725548 | ||
|
|
adb419ed67 | ||
|
|
46e415b140 | ||
|
|
cd5a59b9cf | ||
|
|
69a97ca7b9 | ||
|
|
b55c586fac | ||
|
|
056917d616 | ||
|
|
718efcec6f | ||
|
|
f9d67bb5e8 | ||
|
|
76bb74fcd4 | ||
|
|
0a54c98b9d | ||
|
|
bec54ae366 | ||
|
|
63d7bad8a5 | ||
|
|
ab1630f9fa | ||
|
|
b824fa70eb | ||
|
|
91481a3e4e | ||
|
|
dc6ac9eab0 | ||
|
|
f583674109 | ||
|
|
77fe70019f | ||
|
|
03a2bf2602 | ||
|
|
69edc5bbe7 | ||
|
|
7039770165 | ||
|
|
641767f846 | ||
|
|
af6e2253a2 | ||
|
|
5952e586ce | ||
|
|
f10408aae8 | ||
|
|
d70ae3ab43 | ||
|
|
1391fc46d2 | ||
|
|
11a43e8116 | ||
|
|
817fe9865c | ||
|
|
f4b82d7bc4 | ||
|
|
61526480f9 | ||
|
|
81daf6bc38 | ||
|
|
a38aa56e76 | ||
|
|
729e925174 | ||
|
|
498ac98581 | ||
|
|
cd9ea45463 | ||
|
|
f9c5023e04 | ||
|
|
4abc375a91 | ||
|
|
874df65491 | ||
|
|
1f4b61f572 | ||
|
|
282230c303 | ||
|
|
cce574c3e0 | ||
|
|
877023e1e1 | ||
|
|
265142edd5 | ||
|
|
885a3c4350 | ||
|
|
4b512f84dd | ||
|
|
72d3e7c9b4 | ||
|
|
bdc73a49e0 | ||
|
|
1249ee1fd0 | ||
|
|
42df9efa0c | ||
|
|
82124729af | ||
|
|
29416cb5a3 | ||
|
|
48b9b94f7f | ||
|
|
86a824c97f | ||
|
|
808410c2c7 | ||
|
|
eaf20f0e7a | ||
|
|
fcd814a8d2 | ||
|
|
dc4d3bccd5 | ||
|
|
c7143c1019 | ||
|
|
04873bb174 | ||
|
|
c8ef9fb220 | ||
|
|
5be61f4b47 | ||
|
|
3d155cff83 | ||
|
|
7d47f0a82d | ||
|
|
a529c71a74 | ||
|
|
ea1716ce2a | ||
|
|
0f24b39ebf | ||
|
|
89b60dab8a | ||
|
|
58dd7e4501 | ||
|
|
36b844af88 | ||
|
|
e882b239aa | ||
|
|
3f7bb87a2a | ||
|
|
e908ac2a51 | ||
|
|
8533aca964 | ||
|
|
16494cb7c4 | ||
|
|
b56b34a75c | ||
|
|
21eda8b577 | ||
|
|
24288803b3 | ||
|
|
f0d834b824 | ||
|
|
63bbd7b0d7 | ||
|
|
b111829226 | ||
|
|
010d59bfee | ||
|
|
83b5c6b92d | ||
|
|
bbfdd6c0fe | ||
|
|
cda81cfae0 | ||
|
|
32b0f1168e | ||
|
|
b495e54310 | ||
|
|
d5e6940253 | ||
|
|
24e697eadb | ||
|
|
3e9fd6359d | ||
|
|
43a4572038 | ||
|
|
256eb588bb | ||
|
|
a034e65512 | ||
|
|
8c3386be87 | ||
|
|
3e601bd419 | ||
|
|
478d3c4569 | ||
|
|
3afceb6c2a | ||
|
|
7af8b21dbb | ||
|
|
1e3ada6db4 | ||
|
|
2777a7f506 | ||
|
|
b70fd23836 | ||
|
|
def0385caa | ||
|
|
29dc72889f | ||
|
|
b815a04c87 | ||
|
|
dbc9a060ef | ||
|
|
00401489c2 | ||
|
|
1a7925b3a3 | ||
|
|
406f835f00 | ||
|
|
621dedb37b | ||
|
|
b731e8246f | ||
|
|
ecc31b743f | ||
|
|
5d89d6b143 | ||
|
|
67432b23c2 | ||
|
|
21c0f2af7b | ||
|
|
ad2c386d6a | ||
|
|
be66f5d5c2 | ||
|
|
c2ffef8156 | ||
|
|
e7455f500c | ||
|
|
3eafcfa650 | ||
|
|
8d99dba86b | ||
|
|
1650311246 | ||
|
|
cf5d48e833 | ||
|
|
191677b902 | ||
|
|
31ed19e8b9 | ||
|
|
e1574fa2b4 | ||
|
|
68eb3146ce | ||
|
|
0afaae4b23 | ||
|
|
ae1d1f74f7 | ||
|
|
94cd946b96 | ||
|
|
ed01f4932a | ||
|
|
1aa840a0a2 | ||
|
|
802f0dbde1 | ||
|
|
20d1aad13f | ||
|
|
d11554c88f | ||
|
|
ed704185ab | ||
|
|
2940798ea7 | ||
|
|
eebc189287 | ||
|
|
9185d419d3 | ||
|
|
4cf9d32694 | ||
|
|
1c75b65d53 | ||
|
|
13d006339b | ||
|
|
bf76162635 | ||
|
|
0d52aefc6b | ||
|
|
a6787b0f81 | ||
|
|
8643521127 | ||
|
|
5a720cf9ca | ||
|
|
ccd5945d38 | ||
|
|
9f80e0f5fc | ||
|
|
bba1e67269 | ||
|
|
93240f489e | ||
|
|
7cbc2c37d6 | ||
|
|
c329de2931 | ||
|
|
187233953c | ||
|
|
09170268a3 | ||
|
|
211120c508 | ||
|
|
9e4d190f4f | ||
|
|
fe02ba86a4 | ||
|
|
284fb00971 | ||
|
|
795285c587 | ||
|
|
d6818777d1 | ||
|
|
5bd21ab6e1 | ||
|
|
e1eab96502 | ||
|
|
76b4b8980f | ||
|
|
49e0f485da | ||
|
|
43c2b0eb55 | ||
|
|
942e229ed5 | ||
|
|
26a3402773 | ||
|
|
20033f992a | ||
|
|
f343ed65b5 | ||
|
|
a5a1118527 | ||
|
|
e23366e860 | ||
|
|
b28f75cd7e | ||
|
|
d321448a63 | ||
|
|
c43331ad0a | ||
|
|
e8ca5a59a9 | ||
|
|
c4e23dd016 | ||
|
|
cfc4acc221 | ||
|
|
545c2b1bbb | ||
|
|
69d206440a | ||
|
|
3843e3e017 | ||
|
|
fbcb14a74b | ||
|
|
2a3190dc76 | ||
|
|
1ebe5c0f49 | ||
|
|
0586899a10 | ||
|
|
00dc09ad19 | ||
|
|
78d877b54b | ||
|
|
cdc668d82b | ||
|
|
87718807f0 | ||
|
|
51aec8e96b | ||
|
|
06f7d78d70 | ||
|
|
38cc638591 | ||
|
|
0bf6d74e5f | ||
|
|
133c278ee5 | ||
|
|
2b355592e3 | ||
|
|
ff3eb1d474 | ||
|
|
0b09516678 | ||
|
|
7639f2e1f0 | ||
|
|
2fc712469d | ||
|
|
6ba30e270d | ||
|
|
bf23518e36 | ||
|
|
31a490ea88 | ||
|
|
701ea88347 | ||
|
|
721c56c224 | ||
|
|
c5f8aeff2d | ||
|
|
8278cbe7f8 | ||
|
|
ea6d1b96bd | ||
|
|
360374be62 | ||
|
|
f5acaad8f0 | ||
|
|
93fa6b7b76 | ||
|
|
c0827a7164 | ||
|
|
86cff4effc | ||
|
|
b028960aba | ||
|
|
3c9e3faedb | ||
|
|
44c81fd135 | ||
|
|
26b3710485 | ||
|
|
84e614d0fd | ||
|
|
dceff5542c | ||
|
|
6c7b691083 | ||
|
|
5f4c550c27 | ||
|
|
731b2722ba | ||
|
|
f85ce54d4a | ||
|
|
2601cd58ab | ||
|
|
95a5542e3c | ||
|
|
7a2e1bc804 | ||
|
|
35653e38b3 | ||
|
|
71e25ae42f | ||
|
|
97d7298973 | ||
|
|
de0d0ed52f | ||
|
|
081ceb3e02 | ||
|
|
a29ec458c2 | ||
|
|
816775e309 | ||
|
|
b6363f4539 | ||
|
|
19c4bdd8b3 | ||
|
|
f049a4c84f | ||
|
|
f72fdf525c | ||
|
|
5393759a98 | ||
|
|
5cf18e2875 | ||
|
|
910050985a | ||
|
|
0184713e1a | ||
|
|
45c3c459e1 | ||
|
|
113cb00b95 | ||
|
|
5192651706 | ||
|
|
310ea55f29 | ||
|
|
2e6fae2aad | ||
|
|
368d14f8c8 | ||
|
|
42bc2a9202 | ||
|
|
43bb386b10 | ||
|
|
c171b8ad13 | ||
|
|
2f04cf22ac | ||
|
|
807f6e6922 | ||
|
|
ecbeb802a0 | ||
|
|
2c5725cc39 | ||
|
|
e3666931d8 | ||
|
|
ae02a57261 | ||
|
|
a6a52a73f7 | ||
|
|
0427277cef | ||
|
|
4f43668eec | ||
|
|
b0c15bacc1 | ||
|
|
cfb0f5b0f8 | ||
|
|
667fed579d | ||
|
|
96d2f2c9b2 | ||
|
|
653e657a58 | ||
|
|
5f8f0583d4 | ||
|
|
974a6a30f2 | ||
|
|
9531d0e175 | ||
|
|
40cce0e353 | ||
|
|
3fd41313fc | ||
|
|
a931afe269 | ||
|
|
7d3502b500 | ||
|
|
066f8065d1 | ||
|
|
fb5b2177ca | ||
|
|
f1c02273cb | ||
|
|
661035477c | ||
|
|
aa7e47aa0a | ||
|
|
9c177d270b | ||
|
|
b025523197 | ||
|
|
5b50bd36f7 | ||
|
|
5b708e5eb1 | ||
|
|
dcc5d6291e | ||
|
|
7b5aea52bb | ||
|
|
f5595d0262 | ||
|
|
326d394a0f | ||
|
|
6af8e35a24 | ||
|
|
38cf5d9364 | ||
|
|
8a43baacb2 | ||
|
|
64ca44873b | ||
|
|
2d8064174c | ||
|
|
76a66eaac8 | ||
|
|
2992e3886a | ||
|
|
d5aeff636f | ||
|
|
af2837c392 | ||
|
|
e7b66cd36e | ||
|
|
d50abc8903 | ||
|
|
351a0c777c | ||
|
|
e3c262e5cf | ||
|
|
a293bdcd5e | ||
|
|
c7bbf9c987 | ||
|
|
898a8dcaba | ||
|
|
71c6deed60 | ||
|
|
21f46a1cf2 | ||
|
|
caf339412f | ||
|
|
8001fdcd2a | ||
|
|
162e312832 | ||
|
|
c3d93caa8d | ||
|
|
a71923514f | ||
|
|
55b244ca0d | ||
|
|
2263d3906c | ||
|
|
81c9985c3a | ||
|
|
56ebc7b53e | ||
|
|
c5f88f5a57 | ||
|
|
8a11ec19d1 | ||
|
|
fa53b903db | ||
|
|
fd8d1868a1 | ||
|
|
f0563f14ba | ||
|
|
3197f86762 | ||
|
|
422a8fa953 | ||
|
|
e6c0e39492 | ||
|
|
453bfa7e71 | ||
|
|
23229011db | ||
|
|
e8a68ef261 |
143
.drone.yml
Normal file
143
.drone.yml
Normal file
@@ -0,0 +1,143 @@
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc_make
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:19.04
|
||||
environment:
|
||||
CC: gcc
|
||||
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran perl
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm32_gcc_make
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:19.04
|
||||
environment:
|
||||
CC: gcc
|
||||
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran perl
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_clang_make
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: clang
|
||||
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran perl
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm32_clang_cmake
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: clang
|
||||
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||
commands:
|
||||
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC g++ perl cmake
|
||||
- $CC --version
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc_cmake
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: gcc
|
||||
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||
commands:
|
||||
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC g++ perl cmake
|
||||
- $CC --version
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_clang_cmake
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:18.04
|
||||
environment:
|
||||
CC: clang
|
||||
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||
commands:
|
||||
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC g++ perl cmake
|
||||
- $CC --version
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest
|
||||
37
.travis.yml
37
.travis.yml
@@ -4,11 +4,10 @@ dist: precise
|
||||
sudo: true
|
||||
language: c
|
||||
|
||||
jobs:
|
||||
matrix:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
@@ -18,7 +17,7 @@ jobs:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
@@ -26,6 +25,15 @@ jobs:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX
|
||||
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
@@ -59,7 +67,6 @@ jobs:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
@@ -80,7 +87,6 @@ jobs:
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
stage: test
|
||||
dist: trusty
|
||||
sudo: true
|
||||
language: minimal
|
||||
@@ -120,11 +126,10 @@ jobs:
|
||||
- <<: *test-alpine
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: clang
|
||||
addons:
|
||||
apt:
|
||||
@@ -153,20 +158,28 @@ jobs:
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
stage: test
|
||||
osx_image: xcode8
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
- BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
- BTYPE="BINARY=32 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- COMMON_FLAGS="NUM_THREADS=32"
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
|
||||
- CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 4.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 8.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
@@ -20,9 +20,14 @@ if(MSVC)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
endif()
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
set(NO_AFFINITY 1)
|
||||
endif()
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoids conflicts with other BLAS libraries, especially when using
|
||||
@@ -42,6 +47,19 @@ endif()
|
||||
|
||||
#######
|
||||
|
||||
if(MSVC AND MSVC_STATIC_CRT)
|
||||
set(CompilerFlags
|
||||
CMAKE_CXX_FLAGS
|
||||
CMAKE_CXX_FLAGS_DEBUG
|
||||
CMAKE_CXX_FLAGS_RELEASE
|
||||
CMAKE_C_FLAGS
|
||||
CMAKE_C_FLAGS_DEBUG
|
||||
CMAKE_C_FLAGS_RELEASE
|
||||
)
|
||||
foreach(CompilerFlag ${CompilerFlags})
|
||||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
||||
@@ -62,10 +80,10 @@ endif ()
|
||||
|
||||
set(SUBDIRS ${BLASDIRS})
|
||||
if (NOT NO_LAPACK)
|
||||
list(APPEND SUBDIRS lapack)
|
||||
if(BUILD_RELAPACK)
|
||||
list(APPEND SUBDIRS relapack/src)
|
||||
endif()
|
||||
list(APPEND SUBDIRS lapack)
|
||||
endif ()
|
||||
|
||||
# set which float types we want to build for
|
||||
@@ -134,7 +152,7 @@ endif ()
|
||||
|
||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
||||
if(MSVC)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
||||
endif()
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
||||
@@ -149,15 +167,9 @@ if (${DYNAMIC_ARCH})
|
||||
endforeach()
|
||||
endif ()
|
||||
|
||||
# Only build shared libs for MSVC
|
||||
if (MSVC)
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
endif()
|
||||
|
||||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
@@ -166,7 +178,7 @@ endif()
|
||||
|
||||
# Handle MSVC exports
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
||||
else()
|
||||
# Creates verbose .def file (51KB vs 18KB)
|
||||
@@ -199,7 +211,8 @@ if (USE_THREAD)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
|
||||
if (MSVC OR NOT NOFORTRAN)
|
||||
#if (MSVC OR NOT NOFORTRAN)
|
||||
if (NOT NO_CBLAS)
|
||||
# Broken without fortran on unix
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
@@ -217,6 +230,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||
if (NOT MSVC)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
||||
else()
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
@@ -314,7 +335,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
if(NOT NOFORTRAN)
|
||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
|
||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
|
||||
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
|
||||
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
|
||||
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
|
||||
@@ -327,10 +348,11 @@ endif()
|
||||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
|
||||
@@ -167,4 +167,7 @@ In chronological order:
|
||||
* [2017-02-26] ztrmm kernel for IBM z13
|
||||
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
||||
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
|
||||
|
||||
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
|
||||
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||
* [2019-04-29] power9 sgemm/strmm kernel
|
||||
|
||||
225
Changelog.txt
225
Changelog.txt
@@ -1,4 +1,229 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.7
|
||||
11-Aug 2019
|
||||
|
||||
common:
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
|
||||
x86_64:
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
|
||||
## POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
|
||||
## ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
|
||||
====================================================================
|
||||
Version 0.3.6
|
||||
29-Apr-2019
|
||||
|
||||
common:
|
||||
* the build tools now check that a given cpu TARGET is actually valid
|
||||
* the build-time check of system features (c_check) has been made
|
||||
less dependent on particular perl features (this should mainly
|
||||
benefit building on Windows)
|
||||
* several problem with the ReLAPACK integration were fixed,
|
||||
including INTERFACE64 support and building a shared library
|
||||
* building with CMAKE on BSD systems was improved
|
||||
* a non-absolute SUM function was added based on the
|
||||
existing optimized code for ASUM
|
||||
* CBLAS interfaces to the IxMIN and IxMAX functions were added
|
||||
* a name clash between LAPACKE and BOOST headers was resolved
|
||||
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
|
||||
kernels
|
||||
* a crash on thread (key) deletion with the USE_TLS=1 memory management
|
||||
option was fixed
|
||||
* restored several earlier fixes, in particular for OpenMP performance,
|
||||
building on BSD, and calling fork on CYGWIN, which had inadvertently
|
||||
been dropped in the 0.3.3 rewrite of the memory management code.
|
||||
|
||||
x86_64:
|
||||
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
|
||||
* building with old versions of MSVC was fixed
|
||||
* it is now possible to build a static library on Windows with CMAKE
|
||||
* accessing environment variables on CYGWIN at run time was fixed
|
||||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
|
||||
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
|
||||
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
|
||||
with CMAKE as well
|
||||
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
|
||||
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
|
||||
* assembly bugs involving undeclared modification of input operands were fixed
|
||||
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
|
||||
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
|
||||
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
|
||||
* a similar bug was fixed in the blas_quickdivide code used to split workloads
|
||||
in most functions
|
||||
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
|
||||
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
|
||||
environment does not support AVX512
|
||||
* improved GEMM performance on ZEN targets
|
||||
|
||||
x86:
|
||||
* build failures caused by the recently added checks for AVX512 were fixed
|
||||
* an inline assembly bug involving undeclared modification of an input argument was
|
||||
fixed in the blas_quickdivide code used to split workloads in most functions
|
||||
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
|
||||
|
||||
MIPS32:
|
||||
* a bug in the IMIN implementation made it return the result of IMAX
|
||||
|
||||
POWER:
|
||||
* single precision BLAS1/2 functions have received optimized POWER8 kernels
|
||||
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
|
||||
* building on PPC970 systems under OSX Leopard or Tiger is now supported
|
||||
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
|
||||
* building a shared library on AIX is now supported for POWER6
|
||||
* DYNAMIC_ARCH support has been added for POWER6 and newer
|
||||
|
||||
ARMv7:
|
||||
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||
* a bug in the IMIN implementation made it return the result of IMAX
|
||||
|
||||
ARMv8:
|
||||
* added support for HiSilicon TSV110 cpus
|
||||
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
|
||||
* cross-compilation with CMAKE now works again
|
||||
* a bug in the IMIN implementation made it return the result of IMAX
|
||||
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
|
||||
|
||||
IBM Z:
|
||||
* optimized microkernels for single precicion BLAS1/2 functions have been added
|
||||
for both Z13 and Z14
|
||||
|
||||
====================================================================
|
||||
Version 0.3.5
|
||||
31-Dec-2018
|
||||
|
||||
common:
|
||||
* loop unrolling in TRMV has been enabled again.
|
||||
* A domain error in the thread workload distribution for SYRK
|
||||
has been fixed.
|
||||
* gmake builds will now automatically add -fPIC to the build
|
||||
options if the platform requires it.
|
||||
* a pthreads key leakage (and associate crash on dlclose) in
|
||||
the USE_TLS codepath was fixed.
|
||||
* building of the utest cases on systems that do not provide
|
||||
an implementation of complex.h was fixed.
|
||||
|
||||
x86_64:
|
||||
* the SkylakeX code was changed to compile on OSX.
|
||||
* unwanted application of the -march=skylake-avx512 option
|
||||
to the common code parts of a DYNAMIC_ARCH build was fixed.
|
||||
* improved performance of SGEMM for small workloads on Skylake X.
|
||||
* performance of SGEMM and DGEMM was improved on Haswell.
|
||||
|
||||
ARMV8:
|
||||
* a configuration error that broke the CNRM2 kernel was corrected.
|
||||
* compilation of the GEMM kernels with CMAKE was fixed.
|
||||
* DYNAMIC_ARCH builds are now available with CMAKE as well.
|
||||
* using CMAKE for cross-compilation to the new cpu TARGETs
|
||||
introduced in 0.3.4 now works.
|
||||
|
||||
POWER:
|
||||
* a problem in cpu autodetection for AIX has been corrected.
|
||||
|
||||
====================================================================
|
||||
Version 0.3.4
|
||||
02-Dec-2018
|
||||
|
||||
common:
|
||||
* the new, experimental thread-local memory allocation had
|
||||
inadvertently been left enabled for gmake builds in 0.3.3
|
||||
despite the announcement. It is now disabled by default, and
|
||||
single-threaded builds will keep using the old allocator even
|
||||
if the USE_TLS option is turned on.
|
||||
* OpenBLAS will now provide enough buffer space for at least 50
|
||||
threads by default.
|
||||
* The output of openblas_get_config() now contains the version
|
||||
number.
|
||||
* A serious thread safety bug in GEMV operation with small M and
|
||||
large N size has been fixed.
|
||||
* The code will now automatically call blas_thread_init after a
|
||||
fork if needed before handling a call to openblas_set_num_threads
|
||||
* Accesses to parallelized level3 functions from multiple callers
|
||||
are now serialized to avoid thread races (unless using OpenMP).
|
||||
This should provide better performance than the known-threadsafe
|
||||
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
|
||||
* When building LAPACK with gfortran, -frecursive is now (again)
|
||||
enabled by default to ensure correct behaviour.
|
||||
* The OpenBLAS version cblas.h now supports both CBLAS_ORDER and
|
||||
CBLAS_LAYOUT as the name of the matrix row/column order option.
|
||||
* Externally set LDFLAGS are now passed through to the final compile/link
|
||||
steps to facilitate setting platform-specific linker flags.
|
||||
* A potential race condition during the build of LAPACK (that would
|
||||
usually manifest itself as a failure to build TESTING/MATGEN) has been
|
||||
fixed.
|
||||
* xHEMV has been changed to stay single-threaded for small input sizes
|
||||
where the overhead of multithreading exceeds any possible gains
|
||||
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
|
||||
ThunderX hardware with sizable input.
|
||||
* Linker flags for the PGI compiler have been updated
|
||||
* Behaviour of AXPY with zero increments is now handled in the C interface,
|
||||
correcting the result on at least Intel Atom.
|
||||
* The result matrix from calling SGELSS with an all-zero input matrix is
|
||||
now zeroed completely.
|
||||
|
||||
x86_64:
|
||||
* Autodetection of AMD Ryzen2 has been fixed (again).
|
||||
* CMAKE builds now support labeling of an INTERFACE64=1 build of
|
||||
the library with the _64 suffix.
|
||||
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
|
||||
has been sped up by rewriting with C intrinsics
|
||||
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
|
||||
|
||||
POWER:
|
||||
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
|
||||
* CPU type detection has been implemented for AIX.
|
||||
* CPU type detection has been fixed for NETBSD.
|
||||
|
||||
MIPS64:
|
||||
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
|
||||
* DSDOT on LOONGSON3A has been fixed.
|
||||
* the SGEMM microkernel has been hardened against potential data loss.
|
||||
|
||||
ARMV8:
|
||||
* DYNAMic_ARCH support is now available for 64bit ARM
|
||||
* cross-compiling for ARMV8 under iOS now works.
|
||||
* cpu-specific code has been rearranged to make better use of both
|
||||
hardware commonalities and model-specific compiler optimizations.
|
||||
* XGENE1 has been removed as a TARGET, superseded by the improved generic
|
||||
ARMV8 support.
|
||||
|
||||
ARMV7:
|
||||
* Older assembly mnemonics have been converted to UAL form to allow
|
||||
building with clang 7.0
|
||||
* Cross compiling LAPACKE for Android has been fixed again (broken by
|
||||
update to LAPACK 3.7.0 some while ago).
|
||||
|
||||
====================================================================
|
||||
Version 0.3.3
|
||||
31-Aug-2018
|
||||
|
||||
12
Makefile
12
Makefile
@@ -34,7 +34,7 @@ endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
@@ -96,7 +96,7 @@ endif
|
||||
@echo
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED), 1)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@@ -109,6 +109,7 @@ endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
@@ -123,15 +124,18 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
$(MAKE) -C utest all
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||
$(MAKE) -C cpp_thread_test all
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
libs :
|
||||
ifeq ($(CORE), UNKOWN)
|
||||
ifeq ($(CORE), UNKNOWN)
|
||||
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
|
||||
endif
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
|
||||
13
Makefile.arm
13
Makefile.arm
@@ -1,7 +1,7 @@
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
CCOMMON_OPT += -mfpu=neon
|
||||
FCOMMON_OPT += -mfpu=neon
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
@@ -9,11 +9,6 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV6)
|
||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -march=armv5
|
||||
FCOMMON_OPT += -march=armv5
|
||||
CCOMMON_OPT += -mfpu=vfp
|
||||
FCOMMON_OPT += -mfpu=vfp
|
||||
endif
|
||||
|
||||
@@ -4,22 +4,42 @@ CCOMMON_OPT += -march=armv8-a
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VULCAN)
|
||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA72)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA73)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FALKOR)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
|
||||
@@ -48,6 +48,7 @@ ifndef NO_CBLAS
|
||||
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
endif
|
||||
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@@ -57,14 +58,14 @@ ifndef NO_LAPACKE
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
@@ -81,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
@@ -93,6 +96,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
||||
endif
|
||||
endif
|
||||
|
||||
else
|
||||
#install on AIX has different options syntax
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@@ -109,7 +139,7 @@ endif
|
||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
||||
ifndef NO_SHARED
|
||||
ifneq ($(NO_SHARED),1)
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
||||
@@ -9,7 +9,15 @@ else
|
||||
USE_OPENMP = 1
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
@@ -21,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
|
||||
endif
|
||||
endif
|
||||
|
||||
# workaround for C->FORTRAN ABI violation in LAPACKE
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
endif
|
||||
|
||||
FLAMEPATH = $(HOME)/flame/lib
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.4.dev
|
||||
VERSION = 0.3.8.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -48,6 +48,8 @@ VERSION = 0.3.4.dev
|
||||
# HOSTCC = gcc
|
||||
|
||||
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
|
||||
# Please note that AVX is not available on 32-bit.
|
||||
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
|
||||
# BINARY=64
|
||||
|
||||
# About threaded BLAS. It will be automatically detected if you don't
|
||||
@@ -56,8 +58,14 @@ VERSION = 0.3.4.dev
|
||||
# For force setting for multi threaded, specify USE_THREAD = 1
|
||||
# USE_THREAD = 0
|
||||
|
||||
# If you want to build a single-threaded OpenBLAS, but expect to call this
|
||||
# from several concurrent threads in some other program, comment this in for
|
||||
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
|
||||
# be necessary when USE_OPENMP=1)
|
||||
# USE_LOCKING = 1
|
||||
|
||||
# If you're going to use this library with OpenMP, please comment it in.
|
||||
# This flag is always set for POWER8. Don't modify the flag
|
||||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
||||
# USE_OPENMP = 1
|
||||
|
||||
# The OpenMP scheduler to use - by default this is "static" and you
|
||||
@@ -68,36 +76,45 @@ VERSION = 0.3.4.dev
|
||||
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||
|
||||
# You can define maximum number of threads. Basically it should be
|
||||
# less than actual number of cores. If you don't specify one, it's
|
||||
# automatically detected by the the script.
|
||||
# You can define the maximum number of threads. Basically it should be less
|
||||
# than or equal to the number of CPU threads. If you don't specify one, it's
|
||||
# automatically detected by the build system.
|
||||
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
|
||||
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
|
||||
# detection includes logical CPUs, thus allowing the use of SMT.
|
||||
# Users may opt at runtime to use less than NUM_THREADS threads.
|
||||
#
|
||||
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
|
||||
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
|
||||
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
|
||||
# footprint penalty, even if users reduce the actual number of threads at runtime.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# OpenBLAS's calculation API from multiple threads, please comment this in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can actually
|
||||
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# If you don't need to install the static library, please comment this in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
# if you don't need generate the shared library, please comment it in.
|
||||
# If you don't need to generate the shared library, please comment this in.
|
||||
# NO_SHARED = 1
|
||||
|
||||
# If you don't need CBLAS interface, please comment it in.
|
||||
# If you don't need the CBLAS interface, please comment this in.
|
||||
# NO_CBLAS = 1
|
||||
|
||||
# If you only want CBLAS interface without installing Fortran compiler,
|
||||
# please comment it in.
|
||||
# If you only want the CBLAS interface without installing a Fortran compiler,
|
||||
# please comment this in.
|
||||
# ONLY_CBLAS = 1
|
||||
|
||||
# If you don't need LAPACK, please comment it in.
|
||||
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
|
||||
# If you don't need LAPACK, please comment this in.
|
||||
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
|
||||
# NO_LAPACK = 1
|
||||
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
@@ -106,7 +123,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# If you want to use the legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
@@ -116,8 +133,8 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
# USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
# are not sure(equivalent to "-i8" option).
|
||||
# compilers support this. It's safe to keep this commented out if you
|
||||
# are not sure. (This is equivalent to the "-i8" ifort option).
|
||||
# INTERFACE64 = 1
|
||||
|
||||
# Unfortunately most of kernel won't give us high quality buffer.
|
||||
@@ -125,10 +142,18 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
# but it will consume time. If you don't like it, you can disable one.
|
||||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
|
||||
# This feature is only implemented on Linux, and is always disabled on other platforms.
|
||||
# Enabling affinity handling may improve performance, especially on NUMA systems, but
|
||||
# it may conflict with certain applications that also try to manage affinity.
|
||||
# This conflict can result in threads of the application calling OpenBLAS ending up locked
|
||||
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
|
||||
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
|
||||
# else modifies affinity settings.
|
||||
# Note: enabling affinity has been known to cause problems with NumPy and R
|
||||
NO_AFFINITY = 1
|
||||
|
||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
|
||||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
|
||||
@@ -138,6 +163,10 @@ NO_AFFINITY = 1
|
||||
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
||||
# NO_AVX2 = 1
|
||||
|
||||
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
|
||||
# system will try to determine this automatically)
|
||||
# NO_AVX512 = 1
|
||||
|
||||
# Don't use parallel make.
|
||||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
@@ -162,17 +191,17 @@ NO_AFFINITY = 1
|
||||
# time out to improve performance. This number should be from 4 to 30
|
||||
# which corresponds to (1 << n) cycles. For example, if you set to 26,
|
||||
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
|
||||
# system). Also you can control this mumber by THREAD_TIMEOUT
|
||||
# system). Also you can control this number by THREAD_TIMEOUT
|
||||
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
|
||||
|
||||
# Using special device driver for mapping physically contigous memory
|
||||
# Using special device driver for mapping physically contiguous memory
|
||||
# to the user space. If bigphysarea is enabled, it will use it.
|
||||
# DEVICEDRIVER_ALLOCATION = 1
|
||||
|
||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
||||
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
|
||||
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||
# number of floating point operations necessary for the given problem size, no longer
|
||||
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||
@@ -180,7 +209,7 @@ NO_AFFINITY = 1
|
||||
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||
|
||||
# If you need santy check by comparing reference BLAS. It'll be very
|
||||
# If you need sanity check by comparing results to reference BLAS. It'll be very
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
@@ -192,8 +221,8 @@ NO_AFFINITY = 1
|
||||
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
|
||||
# COMMON_OPT = -O2
|
||||
|
||||
# gfortran option for LAPACK
|
||||
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
|
||||
# gfortran option for LAPACK to improve thread-safety
|
||||
# It is enabled by default in Makefile.system for gfortran
|
||||
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
|
||||
# FCOMMON_OPT = -frecursive
|
||||
|
||||
@@ -220,6 +249,21 @@ COMMON_PROF = -pg
|
||||
# SYMBOLPREFIX=
|
||||
# SYMBOLSUFFIX=
|
||||
|
||||
# Run a C++ based thread safety tester after the build is done.
|
||||
# This is mostly intended as a developer feature to spot regressions, but users and
|
||||
# package maintainers can enable this if they have doubts about the thread safety of
|
||||
# the library, given the configuration in this file.
|
||||
# By default, the thread safety tester launches 52 concurrent calculations at the same
|
||||
# time.
|
||||
#
|
||||
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
|
||||
#
|
||||
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
|
||||
# an OpenMP implementation. If you are cross-compiling this test will probably not
|
||||
# work at all.
|
||||
#
|
||||
# CPP_THREAD_SAFETY_TEST = 1
|
||||
|
||||
#
|
||||
# End of user configuration
|
||||
#
|
||||
|
||||
@@ -9,9 +9,22 @@ ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
ifndef ARCH
|
||||
HOSTARCH := $(shell uname -m)
|
||||
else
|
||||
HOSTARCH = $(ARCH)
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
else ifeq ($(ARCH), powerpc64)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), i386)
|
||||
override ARCH=x86
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
endif
|
||||
|
||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
@@ -59,6 +72,7 @@ endif
|
||||
|
||||
ifdef TARGET
|
||||
GETARCH_FLAGS := -DFORCE_$(TARGET)
|
||||
GETARCH_FLAGS += -DUSER_TARGET
|
||||
endif
|
||||
|
||||
# Force fallbacks for 32bit
|
||||
@@ -88,6 +102,9 @@ endif
|
||||
ifeq ($(TARGET), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET), ARMV8)
|
||||
GETARCH_FLAGS := -DFORCE_ARMV7
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -127,7 +144,12 @@ endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
@@ -145,7 +167,8 @@ GETARCH_FLAGS += -DNO_AVX
|
||||
endif
|
||||
|
||||
ifeq ($(BINARY), 32)
|
||||
GETARCH_FLAGS += -DNO_AVX
|
||||
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
|
||||
NO_AVX512 = 1
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX2), 1)
|
||||
@@ -226,6 +249,10 @@ SMP = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(SMP), 1)
|
||||
USE_LOCKING =
|
||||
endif
|
||||
|
||||
ifndef NEED_PIC
|
||||
NEED_PIC = 1
|
||||
endif
|
||||
@@ -242,9 +269,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||
OBJCONV = $(CROSS_SUFFIX)objconv
|
||||
|
||||
|
||||
# For detect fortran failed, only build BLAS.
|
||||
# When fortran support was either not detected or actively deselected, only build BLAS.
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
NO_LAPACK = 1
|
||||
override FEXTRALIB =
|
||||
endif
|
||||
|
||||
#
|
||||
@@ -294,12 +322,13 @@ CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Test for supporting MS_ABI
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Majar version > 4
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
@@ -377,6 +406,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
|
||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||
endif
|
||||
|
||||
ifdef USE_LOCKING
|
||||
ifneq ($(USE_LOCKING), 0)
|
||||
CCOMMON_OPT += -DUSE_LOCKING
|
||||
endif
|
||||
endif
|
||||
|
||||
#
|
||||
# Architecture dependent settings
|
||||
#
|
||||
@@ -510,6 +545,28 @@ CCOMMON_OPT += $(XCCOMMON_OPT)
|
||||
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), arm64)
|
||||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
DYNAMIC_CORE += POWER9
|
||||
else
|
||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
override DYNAMIC_ARCH=
|
||||
@@ -652,7 +709,7 @@ endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
ifdef BINARY64
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
else
|
||||
CCOMMON_OPT += -tp p7
|
||||
endif
|
||||
@@ -712,12 +769,19 @@ else
|
||||
FCOMMON_OPT += -m32
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
FCOMMON_OPT += -fno-second-underscore
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
# work around ABI problem with passing single-character arguments
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
EXTRALIB += -lgfortran
|
||||
@@ -1023,10 +1087,12 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
ifeq ($(USE_TLS), 1)
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
||||
|
||||
ifndef SYMBOLPREFIX
|
||||
SYMBOLPREFIX =
|
||||
endif
|
||||
@@ -1074,8 +1140,12 @@ endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
ifeq ($(NO_AFFINITY), 0)
|
||||
override undefine NO_AFFINITY
|
||||
else
|
||||
CCOMMON_OPT += -DNO_AFFINITY
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef FUNCTION_PROFILE
|
||||
CCOMMON_OPT += -DFUNCTION_PROFILE
|
||||
@@ -1137,8 +1207,6 @@ ifndef FCOMMON_OPT
|
||||
FCOMMON_OPT = -O2 -frecursive
|
||||
endif
|
||||
|
||||
|
||||
|
||||
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||
|
||||
@@ -1146,6 +1214,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||
#MAKEOVERRIDES =
|
||||
|
||||
ifdef NEED_PIC
|
||||
ifeq (,$(findstring PIC,$(FFLAGS)))
|
||||
override FFLAGS += -fPIC
|
||||
endif
|
||||
endif
|
||||
|
||||
#For LAPACK Fortran codes.
|
||||
#Disable -fopenmp for LAPACK Fortran codes on Windows.
|
||||
ifdef OS_WINDOWS
|
||||
@@ -1204,7 +1278,11 @@ endif
|
||||
|
||||
LIBDLLNAME = $(LIBPREFIX).dll
|
||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||
ifneq ($(OSNAME), AIX)
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||
else
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
||||
endif
|
||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||
|
||||
@@ -9,6 +9,7 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
@@ -22,6 +23,22 @@ endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX2
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -mavx2
|
||||
endif
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
ARFLAGS = -m x64
|
||||
|
||||
@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
|
||||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
||||
17
README.md
17
README.md
@@ -6,11 +6,13 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
## Binary Packages
|
||||
|
||||
@@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
||||
|
||||
## Installation from Source
|
||||
|
||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
|
||||
### Dependencies
|
||||
@@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
|
||||
|
||||
### Compile with MASS support on Power CPU (optional)
|
||||
|
||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
||||
are tuned for optimum performance on POWER architectures.
|
||||
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
|
||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as shown:
|
||||
|
||||
@@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
||||
|
||||
#### MIPS64
|
||||
|
||||
@@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
#### PPC/PPC64
|
||||
|
||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
||||
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
|
||||
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
|
||||
|
||||
### Supported OS
|
||||
|
||||
@@ -201,7 +204,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
|
||||
@@ -48,6 +48,7 @@ POWER5
|
||||
POWER6
|
||||
POWER7
|
||||
POWER8
|
||||
POWER9
|
||||
PPCG4
|
||||
PPC970
|
||||
PPC970MP
|
||||
@@ -83,11 +84,16 @@ ARMV5
|
||||
|
||||
8.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
CORTEXA53
|
||||
CORTEXA57
|
||||
VULCAN
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
FALKOR
|
||||
THUNDERX
|
||||
THUNDERX2T99
|
||||
TSV110
|
||||
|
||||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
Z13
|
||||
Z14
|
||||
|
||||
22
appveyor.yml
22
appveyor.yml
@@ -35,7 +35,15 @@ environment:
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
- COMPILER: cl
|
||||
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-5.3.0
|
||||
WITH_FORTRAN: ignore
|
||||
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
@@ -52,10 +60,17 @@ install:
|
||||
before_build:
|
||||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||
- cd build
|
||||
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
@@ -64,3 +79,4 @@ test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
|
||||
|
||||
51
azure-pipelines.yml
Normal file
51
azure-pipelines.yml
Normal file
@@ -0,0 +1,51 @@
|
||||
trigger:
|
||||
# start a new build for every push
|
||||
batch: False
|
||||
branches:
|
||||
include:
|
||||
- develop
|
||||
|
||||
jobs:
|
||||
# manylinux1 is useful to test because the
|
||||
# standard Docker container uses an old version
|
||||
# of gcc / glibc
|
||||
- job: manylinux1_gcc
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
steps:
|
||||
- script: |
|
||||
echo "FROM quay.io/pypa/manylinux1_x86_64
|
||||
COPY . /tmp/openblas
|
||||
RUN cd /tmp/openblas && \
|
||||
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
|
||||
BTYPE='BINARY=64' CC=gcc && \
|
||||
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
|
||||
make -C test $COMMON_FLAGS $BTYPE && \
|
||||
make -C ctest $COMMON_FLAGS $BTYPE && \
|
||||
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
|
||||
docker build .
|
||||
displayName: Run manylinux1 docker build
|
||||
- job: Intel_SDE_skx
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
steps:
|
||||
- script: |
|
||||
# at the time of writing the available Azure Ubuntu vm image
|
||||
# does not support AVX512VL, so use more recent LTS version
|
||||
echo "FROM ubuntu:bionic
|
||||
COPY . /tmp/openblas
|
||||
RUN apt-get -y update && apt-get -y install \\
|
||||
cmake \\
|
||||
gfortran \\
|
||||
make \\
|
||||
wget
|
||||
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
|
||||
mkdir sde-external-8.35.0-2019-03-11-lin && \\
|
||||
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
|
||||
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
|
||||
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
|
||||
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
|
||||
docker build -t intel_sde .
|
||||
# we need a privileged docker run for sde process attachment
|
||||
docker run --privileged intel_sde
|
||||
displayName: 'Run AVX512 SkylakeX docker build / test'
|
||||
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
|
||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time\n");
|
||||
|
||||
for (i = from; i <= to; i += step) {
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
||||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
@@ -27,29 +28,21 @@ if (p != "") {
|
||||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
A <- matrix(rnorm(n * n), nrow = n)
|
||||
ev <- 0
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
ev <- eigen(A)
|
||||
})
|
||||
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
||||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
@@ -27,26 +28,13 @@ if (p != "") {
|
||||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
B <- matrix(runif(n * n),
|
||||
ncol = n,
|
||||
nrow = n,
|
||||
byrow = TRUE)
|
||||
A <- matrix(runif(n * n), nrow = n)
|
||||
B <- matrix(runif(n * n), nrow = n)
|
||||
C <- 1
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
@@ -54,11 +42,10 @@ while (n <= nto) {
|
||||
l <- l + 1
|
||||
})
|
||||
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
argv <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||
|
||||
nfrom <- 128
|
||||
nto <- 2048
|
||||
nstep <- 128
|
||||
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
||||
loops <- as.numeric(argv[z])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||
@@ -27,31 +28,22 @@ if (p != "") {
|
||||
loops <- as.numeric(p)
|
||||
}
|
||||
|
||||
|
||||
cat(sprintf(
|
||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
||||
nfrom,
|
||||
nto,
|
||||
nstep,
|
||||
loops
|
||||
))
|
||||
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||
cat(sprintf(" SIZE Flops Time\n"))
|
||||
|
||||
n <- nfrom
|
||||
while (n <= nto) {
|
||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
||||
A <- matrix(rnorm(n * n), nrow = n)
|
||||
B <- matrix(rnorm(n * n), nrow = n)
|
||||
|
||||
z <- system.time(for (l in 1:loops) {
|
||||
solve(A, B)
|
||||
})
|
||||
|
||||
mflops <-
|
||||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
||||
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
|
||||
|
||||
st <- sprintf("%.0fx%.0f :", n, n)
|
||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||
|
||||
n <- n + nstep
|
||||
|
||||
}
|
||||
|
||||
98
c_check
98
c_check
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use File::Basename;
|
||||
use File::Temp qw(tempfile);
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
#$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
@@ -31,12 +31,25 @@ if ($?) {
|
||||
|
||||
$cross_suffix = "";
|
||||
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
eval "use File::Basename";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Basename, emulating its functionality";
|
||||
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
|
||||
if ($dirnam ne ".") {
|
||||
$cross_suffix .= $dirnam . "/";
|
||||
}
|
||||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
|
||||
if ($basnam =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
} else {
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
}
|
||||
|
||||
$compiler = "";
|
||||
@@ -171,20 +184,26 @@ if ($?) {
|
||||
|
||||
$have_msa = 0;
|
||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
} else {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
} else {
|
||||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
unlink("tmpf.o");
|
||||
}
|
||||
|
||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
@@ -233,6 +260,19 @@ if ($architecture ne $hostarch) {
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
|
||||
# the initial autodetection will have been confused by the command-line arguments to clang
|
||||
# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
|
||||
if (($os eq "Darwin") && ($cross_suffix ne "")) {
|
||||
my $tmpnam = `xcrun --sdk iphoneos --find clang`;
|
||||
$cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 );
|
||||
# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
|
||||
$cross =1;
|
||||
$architecture = arm64;
|
||||
}
|
||||
|
||||
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
||||
15
cblas.h
15
cblas.h
@@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
|
||||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
||||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
||||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
||||
@@ -88,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
||||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
@@ -44,6 +44,10 @@ endif ()
|
||||
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
|
||||
endif ()
|
||||
|
||||
if (X86)
|
||||
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
||||
endif ()
|
||||
@@ -69,11 +73,16 @@ if (DYNAMIC_ARCH)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
unset(DYNAMIC_ARCH)
|
||||
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
|
||||
unset(DYNAMIC_ARCH CACHE)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets C related variables.
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
||||
else ()
|
||||
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
else ()
|
||||
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
|
||||
if (MIPS64)
|
||||
|
||||
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
||||
if (X86)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
||||
|
||||
@@ -44,7 +44,10 @@ endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
set(EXTRALIB "{EXTRALIB} -lgfortran")
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# helper functions for the kernel CMakeLists.txt
|
||||
|
||||
|
||||
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
|
||||
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
|
||||
macro(SetDefaultL1)
|
||||
set(SAMAXKERNEL amax.S)
|
||||
set(DAMAXKERNEL amax.S)
|
||||
@@ -107,6 +107,12 @@ macro(SetDefaultL1)
|
||||
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(SSUMKERNEL sum.S)
|
||||
set(DSUMKERNEL sum.S)
|
||||
set(CSUMKERNEL zsum.S)
|
||||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL2)
|
||||
@@ -162,4 +168,4 @@ macro(SetDefaultL3)
|
||||
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
endmacro ()
|
||||
endmacro ()
|
||||
|
||||
@@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
|
||||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
|
||||
set(EXTRALIB "${EXTRALIB} -lm")
|
||||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
|
||||
set(EXTRALIB "${EXTRALIB} -lm")
|
||||
endif ()
|
||||
|
||||
@@ -59,6 +59,9 @@ set(FU "")
|
||||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
||||
set(FU "_")
|
||||
endif()
|
||||
if(MINGW AND NOT MINGW64)
|
||||
set(FU "_")
|
||||
endif()
|
||||
|
||||
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
||||
if (${COMPILER_ID} STREQUAL "GNU")
|
||||
@@ -82,18 +85,59 @@ endif ()
|
||||
# f_check
|
||||
if (NOT NOFORTRAN)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
||||
else ()
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define BUNDERSCORE _\n"
|
||||
"#define NEEDBUNDERSCORE 1\n")
|
||||
set(BU "_")
|
||||
endif ()
|
||||
|
||||
# Cannot run getarch on target if we are cross-compiling
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
|
||||
# Write to config as getarch would
|
||||
if (DEFINED TARGET_CORE)
|
||||
set(TCORE ${TARGET_CORE})
|
||||
else()
|
||||
set(TCORE ${CORE})
|
||||
endif()
|
||||
|
||||
# TODO: Set up defines that getarch sets up based on every other target
|
||||
# Perhaps this should be inside a different file as it grows larger
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ${CORE}\n"
|
||||
"#define CHAR_CORENAME \"${CORE}\"\n")
|
||||
if ("${CORE}" STREQUAL "ARMV7")
|
||||
"#define ${TCORE}\n"
|
||||
"#define CORE_${TCORE}\n"
|
||||
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||
if ("${TCORE}" STREQUAL "CORE2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t256\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
@@ -108,7 +152,11 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "ARMV8")
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
@@ -116,18 +164,26 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define L2_ASSOCIATIVE\t32\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
"#define L2_ASSOCIATIVE\t32\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
elseif ("${CORE}" STREQUAL "CORTEXA57")
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t49152\n"
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t2097152\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
@@ -135,15 +191,124 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n")
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t49152\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t128\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "THUNDERX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t128\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t167772164\n"
|
||||
"#define L2_LINESIZE\t128\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t8\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t8\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define L3_SIZE\t33554432\n"
|
||||
"#define L3_LINESIZE\t64\n"
|
||||
"#define L3_ASSOCIATIVE\t32\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
endif()
|
||||
|
||||
# Or should this actually be NUM_CORES?
|
||||
@@ -163,6 +328,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
|
||||
# Move to where gen_config_h would place it
|
||||
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
|
||||
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
|
||||
|
||||
else(NOT CMAKE_CROSSCOMPILING)
|
||||
@@ -178,6 +344,9 @@ else(NOT CMAKE_CROSSCOMPILING)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
else()
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
if (DEFINED TARGET_CORE)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
|
||||
@@ -39,13 +39,44 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
message(STATUS "Targeting the ${TARGET} architecture.")
|
||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||
endif ()
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||
endif ()
|
||||
|
||||
# On x86 no AVX support is available
|
||||
if (X86 OR X86_64)
|
||||
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (INTERFACE64)
|
||||
message(STATUS "Using 64-bit integers.")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
||||
@@ -117,10 +148,16 @@ endif ()
|
||||
|
||||
if (USE_THREAD)
|
||||
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
|
||||
else()
|
||||
if (${USE_LOCKING})
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
endif ()
|
||||
if (NOT DEFINED NEED_PIC)
|
||||
set(NEED_PIC 1)
|
||||
endif ()
|
||||
@@ -137,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||
if (NOT NOFORTRAN)
|
||||
# Fortran Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
else ()
|
||||
set(NO_LAPACK 1)
|
||||
set(NO_LAPACKE 1)
|
||||
endif ()
|
||||
|
||||
if (BINARY64)
|
||||
@@ -162,12 +202,24 @@ if (NEED_PIC)
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
endif ()
|
||||
else ()
|
||||
unset (DYNAMIC_ARCH)
|
||||
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_LIST)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
|
||||
foreach(DCORE ${DYNAMIC_LIST})
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
|
||||
endforeach ()
|
||||
endif ()
|
||||
|
||||
if (NO_LAPACK)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
|
||||
#Disable LAPACK C interface
|
||||
@@ -257,7 +309,7 @@ endif ()
|
||||
|
||||
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
||||
|
||||
# TODO: nead to convert these Makefiles
|
||||
# TODO: need to convert these Makefiles
|
||||
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
||||
|
||||
if (${CORE} STREQUAL "PPC440")
|
||||
@@ -304,6 +356,8 @@ if (MIXED_MEMORY_ALLOCATION)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
|
||||
endif ()
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
|
||||
|
||||
set(REVISION "-r${OpenBLAS_VERSION}")
|
||||
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
|
||||
|
||||
|
||||
@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
|
||||
set(HOST_OS WINNT)
|
||||
endif ()
|
||||
|
||||
if (${HOST_OS} STREQUAL "LINUX")
|
||||
# check if we're building natively on Android (TERMUX)
|
||||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||
if(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
set(HOST_OS ANDROID)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
|
||||
@@ -29,13 +39,45 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
set(X86_64 1)
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
else()
|
||||
if (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
|
||||
set(ARM 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||
set(ARM64 1)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(ARM64 1)
|
||||
else()
|
||||
set(ARM 1)
|
||||
endif()
|
||||
elseif (${CMAKE_CROSSCOMPILING})
|
||||
if (${TARGET} STREQUAL "CORE2")
|
||||
if (NOT BINARY)
|
||||
set(X86 1)
|
||||
elseif (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
set(ARM64 1)
|
||||
endif ()
|
||||
else ()
|
||||
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
|
||||
endif()
|
||||
|
||||
if (X86_64)
|
||||
@@ -68,10 +110,9 @@ endif()
|
||||
|
||||
if (X86_64 OR X86)
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
if (NO_AVX512 EQUAL 1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
file(REMOVE "avx512.tmp" "avx512.o")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
|
||||
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
|
||||
endfunction ()
|
||||
|
||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
|
||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
|
||||
# @param sources_in the source files to build from
|
||||
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
|
||||
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
|
||||
|
||||
15
common.h
15
common.h
@@ -85,6 +85,8 @@ extern "C" {
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
#include <unistd.h>
|
||||
#elif _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#include <time.h>
|
||||
|
||||
@@ -129,7 +131,7 @@ extern "C" {
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#endif
|
||||
@@ -183,7 +185,7 @@ extern "C" {
|
||||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
@@ -198,7 +200,7 @@ extern "C" {
|
||||
#error "You can't specify both LOCK operation!"
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#define USE_PTHREAD_LOCK
|
||||
#undef USE_PTHREAD_SPINLOCK
|
||||
#endif
|
||||
@@ -348,6 +350,11 @@ typedef int blasint;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef POWER9
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
@@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) 0
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
#else
|
||||
|
||||
@@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
static __inline BLASULONG rpcc(void){
|
||||
BLASULONG ret = 0;
|
||||
|
||||
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define RPCC_DEFINED
|
||||
#define RPCC64BIT
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
@@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
.macro PROLOGUE
|
||||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
.endm
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define CDOTC_K cdotc_k
|
||||
#define CNRM2_K cnrm2_k
|
||||
#define CSCAL_K cscal_k
|
||||
#define CSUM_K csum_k
|
||||
#define CSWAP_K cswap_k
|
||||
#define CROT_K csrot_k
|
||||
|
||||
@@ -249,6 +250,7 @@
|
||||
#define CDOTC_K gotoblas -> cdotc_k
|
||||
#define CNRM2_K gotoblas -> cnrm2_k
|
||||
#define CSCAL_K gotoblas -> cscal_k
|
||||
#define CSUM_K gotoblas -> csum_k
|
||||
#define CSWAP_K gotoblas -> cswap_k
|
||||
#define CROT_K gotoblas -> csrot_k
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define DDOTC_K ddot_k
|
||||
#define DNRM2_K dnrm2_k
|
||||
#define DSCAL_K dscal_k
|
||||
#define DSUM_K dsum_k
|
||||
#define DSWAP_K dswap_k
|
||||
#define DROT_K drot_k
|
||||
|
||||
@@ -174,6 +175,7 @@
|
||||
#define DDOTC_K gotoblas -> ddot_k
|
||||
#define DNRM2_K gotoblas -> dnrm2_k
|
||||
#define DSCAL_K gotoblas -> dscal_k
|
||||
#define DSUM_K gotoblas -> dsum_k
|
||||
#define DSWAP_K gotoblas -> dswap_k
|
||||
#define DROT_K gotoblas -> drot_k
|
||||
|
||||
|
||||
@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
|
||||
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
||||
|
||||
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
|
||||
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
|
||||
double BLASFUNC(dsum) (blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
|
||||
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
|
||||
|
||||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
||||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
||||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
||||
|
||||
146
common_lapack.h
146
common_lapack.h
@@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
|
||||
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
|
||||
double zasum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
float ssum_k (BLASLONG, float *, BLASLONG);
|
||||
double dsum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
float csum_k (BLASLONG, float *, BLASLONG);
|
||||
double zsum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
float samax_k (BLASLONG, float *, BLASLONG);
|
||||
double damax_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
@@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
|
||||
float * A, BLASLONG strideA,
|
||||
float * B, BLASLONG strideB,
|
||||
float * R, BLASLONG strideR);
|
||||
|
||||
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
|
||||
|
||||
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
|
||||
171
common_macro.h
171
common_macro.h
@@ -66,6 +66,7 @@
|
||||
#define DOTC_K QDOTC_K
|
||||
#define NRM2_K QNRM2_K
|
||||
#define SCAL_K QSCAL_K
|
||||
#define SUM_K QSUM_K
|
||||
#define SWAP_K QSWAP_K
|
||||
#define ROT_K QROT_K
|
||||
|
||||
@@ -356,6 +357,7 @@
|
||||
#define DOTC_K DDOTC_K
|
||||
#define NRM2_K DNRM2_K
|
||||
#define SCAL_K DSCAL_K
|
||||
#define SUM_K DSUM_K
|
||||
#define SWAP_K DSWAP_K
|
||||
#define ROT_K DROT_K
|
||||
|
||||
@@ -639,7 +641,7 @@
|
||||
#define IMATCOPY_K_CT DIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K DGEADD_K
|
||||
#define GEADD_K DGEADD_K
|
||||
#else
|
||||
|
||||
#define AMAX_K SAMAX_K
|
||||
@@ -658,6 +660,7 @@
|
||||
#define DOTC_K SDOTC_K
|
||||
#define NRM2_K SNRM2_K
|
||||
#define SCAL_K SSCAL_K
|
||||
#define SUM_K SSUM_K
|
||||
#define SWAP_K SSWAP_K
|
||||
#define ROT_K SROT_K
|
||||
|
||||
@@ -941,7 +944,7 @@
|
||||
#define IMATCOPY_K_CT SIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
#define GEADD_K SGEADD_K
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
@@ -962,6 +965,7 @@
|
||||
#define DOTC_K XDOTC_K
|
||||
#define NRM2_K XNRM2_K
|
||||
#define SCAL_K XSCAL_K
|
||||
#define SUM_K XSUM_K
|
||||
#define SWAP_K XSWAP_K
|
||||
#define ROT_K XROT_K
|
||||
|
||||
@@ -1363,6 +1367,7 @@
|
||||
#define DOTC_K ZDOTC_K
|
||||
#define NRM2_K ZNRM2_K
|
||||
#define SCAL_K ZSCAL_K
|
||||
#define SUM_K ZSUM_K
|
||||
#define SWAP_K ZSWAP_K
|
||||
#define ROT_K ZROT_K
|
||||
|
||||
@@ -1765,7 +1770,7 @@
|
||||
#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K ZGEADD_K
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#else
|
||||
|
||||
@@ -1785,6 +1790,7 @@
|
||||
#define DOTC_K CDOTC_K
|
||||
#define NRM2_K CNRM2_K
|
||||
#define SCAL_K CSCAL_K
|
||||
#define SUM_K CSUM_K
|
||||
#define SWAP_K CSWAP_K
|
||||
#define ROT_K CROT_K
|
||||
|
||||
@@ -2187,7 +2193,7 @@
|
||||
#define IMATCOPY_K_CTC CIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC CIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K CGEADD_K
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -2800,3 +2806,160 @@ typedef struct {
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
|
||||
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE strtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE strtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE strtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE strtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE strtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE strtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE strtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE strtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE xtrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE xtrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ztrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ztrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ctrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ctrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
||||
|
||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
||||
|
||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
||||
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
||||
|
||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
||||
|
||||
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*zsum_k) (BLASLONG, double *, BLASLONG);
|
||||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#ifndef COMMON_POWER
|
||||
#define COMMON_POWER
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#else
|
||||
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
@@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define L1_DUALFETCH
|
||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||
#define L1_PREFETCH dcbtst
|
||||
@@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#ifndef __64BIT__
|
||||
#define PROLOGUE \
|
||||
.section .text;\
|
||||
@@ -598,9 +598,14 @@ REALNAME:;\
|
||||
#ifndef __64BIT__
|
||||
#define PROLOGUE \
|
||||
.machine "any";\
|
||||
.toc;\
|
||||
.globl .REALNAME;\
|
||||
.globl REALNAME;\
|
||||
.csect REALNAME[DS],3;\
|
||||
REALNAME:;\
|
||||
.long .REALNAME, TOC[tc0], 0;\
|
||||
.csect .text[PR],5;\
|
||||
.REALNAME:;
|
||||
.REALNAME:
|
||||
|
||||
#define EPILOGUE \
|
||||
_section_.text:;\
|
||||
@@ -611,9 +616,14 @@ _section_.text:;\
|
||||
|
||||
#define PROLOGUE \
|
||||
.machine "any";\
|
||||
.toc;\
|
||||
.globl .REALNAME;\
|
||||
.globl REALNAME;\
|
||||
.csect REALNAME[DS],3;\
|
||||
REALNAME:;\
|
||||
.llong .REALNAME, TOC[tc0], 0;\
|
||||
.csect .text[PR], 5;\
|
||||
.REALNAME:;
|
||||
.REALNAME:
|
||||
|
||||
#define EPILOGUE \
|
||||
_section_.text:;\
|
||||
@@ -774,7 +784,7 @@ Lmcount$lazy_ptr:
|
||||
|
||||
#define HALT mfspr r0, 1023
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#if defined(PPC440) || defined(PPC440FP2)
|
||||
#undef MAX_CPU_NUMBER
|
||||
#define MAX_CPU_NUMBER 1
|
||||
@@ -802,7 +812,7 @@ Lmcount$lazy_ptr:
|
||||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#elif defined(POWER8) || defined(POWER9)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
@@ -819,7 +829,7 @@ Lmcount$lazy_ptr:
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#ifndef __64BIT__
|
||||
#define FRAMESLOT(X) (((X) * 4) + 8)
|
||||
#else
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define QDOTC_K qdot_k
|
||||
#define QNRM2_K qnrm2_k
|
||||
#define QSCAL_K qscal_k
|
||||
#define QSUM_K qsum_k
|
||||
#define QSWAP_K qswap_k
|
||||
#define QROT_K qrot_k
|
||||
|
||||
@@ -161,6 +162,7 @@
|
||||
#define QDOTC_K gotoblas -> qdot_k
|
||||
#define QNRM2_K gotoblas -> qnrm2_k
|
||||
#define QSCAL_K gotoblas -> qscal_k
|
||||
#define QSUM_K gotoblas -> qsum_k
|
||||
#define QSWAP_K gotoblas -> qswap_k
|
||||
#define QROT_K gotoblas -> qrot_k
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#define ISMAX_K ismax_k
|
||||
#define ISMIN_K ismin_k
|
||||
#define SASUM_K sasum_k
|
||||
#define SSUM_K ssum_k
|
||||
#define SAXPYU_K saxpy_k
|
||||
#define SAXPYC_K saxpy_k
|
||||
#define SCOPY_K scopy_k
|
||||
@@ -170,6 +171,7 @@
|
||||
#define ISMAX_K gotoblas -> ismax_k
|
||||
#define ISMIN_K gotoblas -> ismin_k
|
||||
#define SASUM_K gotoblas -> sasum_k
|
||||
#define SSUM_K gotoblas -> ssum_k
|
||||
#define SAXPYU_K gotoblas -> saxpy_k
|
||||
#define SAXPYC_K gotoblas -> saxpy_k
|
||||
#define SCOPY_K gotoblas -> scopy_k
|
||||
|
||||
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* SIZE must be carefully chosen to be:
|
||||
* - as small as possible to maximize the number of stack allocation
|
||||
* - large enough to support all architectures and kernel
|
||||
* Chosing a too small SIZE will lead to a stack smashing.
|
||||
* Choosing a SIZE too small will lead to a stack smashing.
|
||||
*/
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||
|
||||
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *c, BLASLONG ldc, int (*fuction)());
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define XDOTC_K xdotc_k
|
||||
#define XNRM2_K xnrm2_k
|
||||
#define XSCAL_K xscal_k
|
||||
#define XSUM_K xsum_k
|
||||
#define XSWAP_K xswap_k
|
||||
#define XROT_K xqrot_k
|
||||
|
||||
@@ -227,6 +228,7 @@
|
||||
#define XDOTC_K gotoblas -> xdotc_k
|
||||
#define XNRM2_K gotoblas -> xnrm2_k
|
||||
#define XSCAL_K gotoblas -> xscal_k
|
||||
#define XSUM_K gotoblas -> xsum_k
|
||||
#define XSWAP_K gotoblas -> xswap_k
|
||||
#define XROT_K gotoblas -> xqrot_k
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
|
||||
|
||||
return result;
|
||||
#endif
|
||||
@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#endif
|
||||
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
//Enable some optimazation for barcelona.
|
||||
//Enable some optimization for barcelona.
|
||||
#define BARCELONA_OPTIMIZATION
|
||||
#endif
|
||||
|
||||
|
||||
@@ -129,7 +129,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||
*ecx=cpuinfo[2];
|
||||
*edx=cpuinfo[3];
|
||||
#else
|
||||
__asm__ __volatile__("cpuid"
|
||||
__asm__ __volatile__("mov $0, %%ecx;"
|
||||
"cpuid"
|
||||
: "=a" (*eax),
|
||||
"=b" (*ebx),
|
||||
"=c" (*ecx),
|
||||
@@ -210,7 +211,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#ifdef ASSEMBLER
|
||||
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
//Enable some optimazation for barcelona.
|
||||
//Enable some optimization for barcelona.
|
||||
#define BARCELONA_OPTIMIZATION
|
||||
#endif
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define ZDOTC_K zdotc_k
|
||||
#define ZNRM2_K znrm2_k
|
||||
#define ZSCAL_K zscal_k
|
||||
#define ZSUM_K zsum_k
|
||||
#define ZSWAP_K zswap_k
|
||||
#define ZROT_K zdrot_k
|
||||
|
||||
@@ -249,6 +250,7 @@
|
||||
#define ZDOTC_K gotoblas -> zdotc_k
|
||||
#define ZNRM2_K gotoblas -> znrm2_k
|
||||
#define ZSCAL_K gotoblas -> zscal_k
|
||||
#define ZSUM_K gotoblas -> zsum_k
|
||||
#define ZSWAP_K gotoblas -> zswap_k
|
||||
#define ZROT_K gotoblas -> zdrot_k
|
||||
|
||||
|
||||
14
cpp_thread_test/Makefile
Normal file
14
cpp_thread_test/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
include ../Makefile.rule
|
||||
|
||||
all :: dgemv_tester dgemm_tester
|
||||
|
||||
dgemv_tester :
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
|
||||
./dgemv_tester
|
||||
|
||||
dgemm_tester : dgemv_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
|
||||
./dgemm_tester
|
||||
|
||||
clean ::
|
||||
rm -f dgemv_tester dgemm_tester
|
||||
55
cpp_thread_test/cpp_thread_safety_common.h
Normal file
55
cpp_thread_test/cpp_thread_safety_common.h
Normal file
@@ -0,0 +1,55 @@
|
||||
inline void pauser(){
|
||||
/// a portable way to pause a program
|
||||
std::string dummy;
|
||||
std::cout << "Press enter to continue...";
|
||||
std::getline(std::cin, dummy);
|
||||
}
|
||||
|
||||
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
||||
for(uint32_t i=0; i<numMat; i++){
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||
matBlock[i][j] = rngdist(PRNG);
|
||||
}
|
||||
}
|
||||
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
|
||||
for(uint32_t j=0; j<numMat; j++){
|
||||
matBlock[i+j] = matBlock[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
|
||||
for(uint32_t i=0; i<numVec; i++){
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||
vecBlock[i][j] = rngdist(PRNG);
|
||||
}
|
||||
}
|
||||
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
|
||||
for(uint32_t j=0; j<numVec; j++){
|
||||
vecBlock[i+j] = vecBlock[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::mt19937_64 InitPRNG(){
|
||||
std::random_device rd;
|
||||
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
|
||||
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
|
||||
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
|
||||
return PRNG;
|
||||
}
|
||||
|
||||
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
||||
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
|
||||
std::cout<<i<<std::endl;
|
||||
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
|
||||
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
|
||||
}
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
}
|
||||
92
cpp_thread_test/dgemm_thread_safety.cpp
Normal file
92
cpp_thread_test/dgemm_thread_safety.cpp
Normal file
@@ -0,0 +1,92 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <omp.h>
|
||||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
if(argc == 4){
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs[0]);
|
||||
numConcurrentThreads = std::stoul(cliArgs[1]);
|
||||
numTestRounds = std::stoul(cliArgs[2]);
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
|
||||
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"| DGEMM thread safety tester |\n";
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
|
||||
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
|
||||
matBlock[i].resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//pauser();
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Testing CBLAS DGEMM thread safety\n";
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
std::cout<<"DGEMM round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
|
||||
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i].get();
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
|
||||
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
101
cpp_thread_test/dgemv_thread_safety.cpp
Normal file
101
cpp_thread_test/dgemv_thread_safety.cpp
Normal file
@@ -0,0 +1,101 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <omp.h>
|
||||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
|
||||
const blasint inc = 1;
|
||||
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
if(argc == 4){
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs.at(0));
|
||||
numConcurrentThreads = std::stoul(cliArgs.at(1));
|
||||
numTestRounds = std::stoul(cliArgs.at(2));
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
|
||||
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
|
||||
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"| DGEMV thread safety tester |\n";
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
|
||||
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
matBlock.at(i).resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Allocating vectors..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
|
||||
vecBlock.at(i).resize(randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//pauser();
|
||||
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Filling vectors with random numbers..."<<std::flush;
|
||||
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
std::cout<<"DGEMV round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i].get();
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
|
||||
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
6
cpuid.h
6
cpuid.h
@@ -53,6 +53,7 @@
|
||||
#define VENDOR_SIS 8
|
||||
#define VENDOR_TRANSMETA 9
|
||||
#define VENDOR_NSC 10
|
||||
#define VENDOR_HYGON 11
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
@@ -116,6 +117,7 @@
|
||||
#define CORE_EXCAVATOR 26
|
||||
#define CORE_ZEN 27
|
||||
#define CORE_SKYLAKEX 28
|
||||
#define CORE_DHYANA 29
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
@@ -139,6 +141,7 @@
|
||||
#define HAVE_FMA4 (1 << 19)
|
||||
#define HAVE_FMA3 (1 << 20)
|
||||
#define HAVE_AVX512VL (1 << 21)
|
||||
#define HAVE_AVX2 (1 << 22)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
@@ -214,5 +217,8 @@ typedef struct {
|
||||
#define CPUTYPE_EXCAVATOR 50
|
||||
#define CPUTYPE_ZEN 51
|
||||
#define CPUTYPE_SKYLAKEX 52
|
||||
#define CPUTYPE_DHYANA 53
|
||||
|
||||
#define CPUTYPE_HYGON_UNKNOWN 54
|
||||
|
||||
#endif
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
#define CPU_CORTEXA15 4
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"UNKNOWN",
|
||||
"ARMV6",
|
||||
"ARMV7",
|
||||
"CORTEXA9",
|
||||
|
||||
189
cpuid_arm64.c
189
cpuid_arm64.c
@@ -29,27 +29,43 @@
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV8 1
|
||||
#define CPU_CORTEXA57 2
|
||||
#define CPU_VULCAN 3
|
||||
#define CPU_THUNDERX 4
|
||||
#define CPU_THUNDERX2T99 5
|
||||
// Arm
|
||||
#define CPU_CORTEXA53 2
|
||||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
// Qualcomm
|
||||
#define CPU_FALKOR 6
|
||||
// Cavium
|
||||
#define CPU_THUNDERX 7
|
||||
#define CPU_THUNDERX2T99 8
|
||||
//Hisilicon
|
||||
#define CPU_TSV110 9
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
"ARMV8" ,
|
||||
"CORTEXA53",
|
||||
"CORTEXA57",
|
||||
"VULCAN",
|
||||
"CORTEXA72",
|
||||
"CORTEXA73",
|
||||
"FALKOR",
|
||||
"THUNDERX",
|
||||
"THUNDERX2T99"
|
||||
"THUNDERX2T99",
|
||||
"TSV110"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"unknown",
|
||||
"armv8" ,
|
||||
"armv8",
|
||||
"cortexa53",
|
||||
"cortexa57",
|
||||
"vulcan",
|
||||
"cortexa72",
|
||||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99"
|
||||
"thunderx2t99",
|
||||
"tsv110"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
@@ -78,7 +94,7 @@ int get_feature(char *search)
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
while( (t = strtok(NULL," ")))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
}
|
||||
@@ -114,15 +130,28 @@ int detect(void)
|
||||
|
||||
fclose(infile);
|
||||
if(cpu_part != NULL && cpu_implementer != NULL) {
|
||||
if (strstr(cpu_implementer, "0x41") &&
|
||||
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") ))
|
||||
return CPU_CORTEXA57; //or compatible A53, A72
|
||||
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
|
||||
return CPU_VULCAN;
|
||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||
// Arm
|
||||
if (strstr(cpu_implementer, "0x41")) {
|
||||
if (strstr(cpu_part, "0xd03"))
|
||||
return CPU_CORTEXA53;
|
||||
else if (strstr(cpu_part, "0xd07"))
|
||||
return CPU_CORTEXA57;
|
||||
else if (strstr(cpu_part, "0xd08"))
|
||||
return CPU_CORTEXA72;
|
||||
else if (strstr(cpu_part, "0xd09"))
|
||||
return CPU_CORTEXA73;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
return CPU_FALKOR;
|
||||
// Cavium
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
|
||||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||
return CPU_THUNDERX2T99;
|
||||
// HiSilicon
|
||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||
return CPU_TSV110;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
@@ -177,67 +206,93 @@ void get_subdirname(void)
|
||||
printf("arm64");
|
||||
}
|
||||
|
||||
void get_cpucount(void)
|
||||
{
|
||||
int n=0;
|
||||
|
||||
#ifdef linux
|
||||
FILE *infile;
|
||||
char buffer[2048], *p,*t;
|
||||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("processor", buffer, 9))
|
||||
n++;
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
printf("#define NUM_CORES %d\n",n);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
// All arches should define ARMv8
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
|
||||
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
|
||||
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_CORTEXA53:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_VULCAN:
|
||||
printf("#define VULCAN \n");
|
||||
printf("#define HAVE_VFP \n");
|
||||
printf("#define HAVE_VFPV3 \n");
|
||||
printf("#define HAVE_NEON \n");
|
||||
printf("#define HAVE_VFPV4 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
printf("#define CORTEXA57\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
printf("#define HAVE_VFPV3\n");
|
||||
printf("#define HAVE_NEON\n");
|
||||
printf("#define HAVE_VFPV4\n");
|
||||
case CPU_CORTEXA72:
|
||||
case CPU_CORTEXA73:
|
||||
// Common minimum settings for these Arm cores
|
||||
// Can change a lot, but we need to be conservative
|
||||
// TODO: detect info from /sys if possible
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
@@ -249,11 +304,7 @@ void get_cpuconfig(void)
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define VULCAN \n");
|
||||
printf("#define HAVE_VFP \n");
|
||||
printf("#define HAVE_VFPV3 \n");
|
||||
printf("#define HAVE_NEON \n");
|
||||
printf("#define HAVE_VFPV4 \n");
|
||||
printf("#define THUNDERX2T99 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
@@ -269,7 +320,23 @@ void get_cpuconfig(void)
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
||||
|
||||
@@ -305,12 +372,10 @@ void get_features(void)
|
||||
if( p == NULL ) return;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
while( (t = strtok(NULL," ")))
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CPU_1004K 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"UNKNOWN",
|
||||
"P5600",
|
||||
"1004K"
|
||||
};
|
||||
|
||||
@@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CPU_I6500 6
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"UNKNOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B",
|
||||
|
||||
@@ -56,6 +56,7 @@
|
||||
#define CPUTYPE_CELL 6
|
||||
#define CPUTYPE_PPCG4 7
|
||||
#define CPUTYPE_POWER8 8
|
||||
#define CPUTYPE_POWER9 9
|
||||
|
||||
char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
@@ -66,7 +67,8 @@ char *cpuname[] = {
|
||||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
};
|
||||
|
||||
char *lowercpuname[] = {
|
||||
@@ -78,7 +80,8 @@ char *lowercpuname[] = {
|
||||
"power6",
|
||||
"cell",
|
||||
"ppcg4",
|
||||
"power8"
|
||||
"power8",
|
||||
"power9"
|
||||
};
|
||||
|
||||
char *corename[] = {
|
||||
@@ -90,7 +93,8 @@ char *corename[] = {
|
||||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
@@ -120,6 +124,7 @@ int detect(void){
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
|
||||
@@ -127,6 +132,33 @@ int detect(void){
|
||||
#endif
|
||||
|
||||
#ifdef _AIX
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = popen("prtconf|grep 'Processor Type'", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Pro", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pclose(infile);
|
||||
|
||||
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
|
||||
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
|
||||
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
|
||||
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
return CPUTYPE_POWER5;
|
||||
#endif
|
||||
|
||||
@@ -143,12 +175,12 @@ int detect(void){
|
||||
return CPUTYPE_PPC970;
|
||||
#endif
|
||||
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
|
||||
int id;
|
||||
id = __asm __volatile("mfpvr %0" : "=r"(id));
|
||||
__asm __volatile("mfpvr %0" : "=r"(id));
|
||||
switch ( id >> 16 ) {
|
||||
case 0x4e: // POWER9
|
||||
return return CPUTYPE_POWER8;
|
||||
return CPUTYPE_POWER9;
|
||||
break;
|
||||
case 0x4d:
|
||||
case 0x4b: // POWER8/8E
|
||||
|
||||
245
cpuid_x86.c
245
cpuid_x86.c
@@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||
("mov %%ebx, %%edi;"
|
||||
"cpuid;"
|
||||
"xchgl %%ebx, %%edi;"
|
||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
|
||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
|
||||
#else
|
||||
__asm__ __volatile__
|
||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
|
||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -211,6 +211,44 @@ int support_avx(){
|
||||
#endif
|
||||
}
|
||||
|
||||
int support_avx2(){
|
||||
#ifndef NO_AVX2
|
||||
int eax, ebx, ecx=0, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 0)
|
||||
ret=1; //OS supports AVX2
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int support_avx512(){
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & 32) != 32){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
xgetbv(0, &eax, &edx);
|
||||
if((eax & 0xe0) == 0xe0)
|
||||
ret=1; //OS supports AVX512VL
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
@@ -233,6 +271,7 @@ int get_vendor(void){
|
||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
||||
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
|
||||
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
@@ -294,6 +333,8 @@ int get_cputype(int gettype){
|
||||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
||||
#ifndef NO_AVX
|
||||
if (support_avx()) feature |= HAVE_AVX;
|
||||
if (support_avx2()) feature |= HAVE_AVX2;
|
||||
if (support_avx512()) feature |= HAVE_AVX512VL;
|
||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||
#endif
|
||||
|
||||
@@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
||||
}
|
||||
}
|
||||
|
||||
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) {
|
||||
if ((get_vendor() == VENDOR_AMD) ||
|
||||
(get_vendor() == VENDOR_HYGON) ||
|
||||
(get_vendor() == VENDOR_CENTAUR)) {
|
||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
LDTB.size = 4096;
|
||||
@@ -1154,7 +1197,11 @@ int get_cpuname(void){
|
||||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_CORE2;
|
||||
#else
|
||||
return CPUTYPE_PENTIUM2;
|
||||
#endif
|
||||
case 7:
|
||||
case 8:
|
||||
case 10:
|
||||
@@ -1168,7 +1215,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_CORE2;
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
case 1: // family 6 exmodel 1
|
||||
switch (model) {
|
||||
case 6:
|
||||
return CPUTYPE_CORE2;
|
||||
@@ -1185,7 +1232,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_DUNNINGTON;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case 2: // family 6 exmodel 2
|
||||
switch (model) {
|
||||
case 5:
|
||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||
@@ -1214,7 +1261,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
case 3: // family 6 exmodel 3
|
||||
switch (model) {
|
||||
case 7:
|
||||
// Bay Trail
|
||||
@@ -1228,57 +1275,47 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
case 15:
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 13:
|
||||
//Broadwell
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
case 4: // family 6 exmodel 4
|
||||
switch (model) {
|
||||
case 5:
|
||||
case 6:
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7:
|
||||
case 15:
|
||||
//Broadwell
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14:
|
||||
//Skylake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
@@ -1288,84 +1325,85 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
case 5: // family 6 exmodel 5
|
||||
switch (model) {
|
||||
case 6:
|
||||
//Broadwell
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
case 14:
|
||||
// Skylake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7:
|
||||
// Xeon Phi Knights Landing
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
// Apollo Lake
|
||||
case 15:
|
||||
// Denverton
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
case 6: // family 6 exmodel 6
|
||||
switch (model) {
|
||||
case 6: // Cannon Lake
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
switch (model) {
|
||||
case 14: // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Ice Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
@@ -1393,7 +1431,11 @@ int get_cpuname(void){
|
||||
case 0x5:
|
||||
return CPUTYPE_AMDK6;
|
||||
case 0x6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_BARCELONA;
|
||||
#else
|
||||
return CPUTYPE_ATHLON;
|
||||
#endif
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 0:
|
||||
@@ -1469,6 +1511,26 @@ int get_cpuname(void){
|
||||
return CPUTYPE_AMD_UNKNOWN;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_HYGON){
|
||||
switch (family) {
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 9:
|
||||
//Hygon Dhyana
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return CPUTYPE_HYGON_UNKNOWN;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CYRIX){
|
||||
switch (family) {
|
||||
case 0x4:
|
||||
@@ -1590,7 +1652,8 @@ static char *cpuname[] = {
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
"SKYLAKEX",
|
||||
"DHYANA"
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
@@ -1645,11 +1708,12 @@ static char *lowercpuname[] = {
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
"skylakex",
|
||||
"dhyana"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
"UNKOWN",
|
||||
"UNKNOWN",
|
||||
"80486",
|
||||
"P5",
|
||||
"P6",
|
||||
@@ -1677,7 +1741,8 @@ static char *corename[] = {
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
"SKYLAKEX",
|
||||
"DHYANA"
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
@@ -1709,7 +1774,8 @@ static char *corename_lower[] = {
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
"skylakex",
|
||||
"dhyana"
|
||||
};
|
||||
|
||||
|
||||
@@ -1752,7 +1818,11 @@ int get_coretype(void){
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CORE_CORE2;
|
||||
#else
|
||||
return CORE_P6;
|
||||
#endif
|
||||
case 7:
|
||||
return CORE_KATMAI;
|
||||
case 8:
|
||||
@@ -1959,7 +2029,11 @@ int get_coretype(void){
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (family <= 0x5) return CORE_80486;
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
if (family <= 0xe) return CORE_BARCELONA;
|
||||
#else
|
||||
if (family <= 0xe) return CORE_ATHLON;
|
||||
#endif
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
@@ -2009,6 +2083,8 @@ int get_coretype(void){
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// Ryzen 2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
@@ -2024,6 +2100,23 @@ int get_coretype(void){
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_HYGON){
|
||||
if (family == 0xf){
|
||||
if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CORE_BARCELONA;
|
||||
} else {
|
||||
return CORE_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
@@ -2110,6 +2203,8 @@ void get_cpuconfig(void){
|
||||
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
|
||||
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
|
||||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
|
||||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
|
||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||
@@ -2178,6 +2273,8 @@ void get_sse(void){
|
||||
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
|
||||
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
|
||||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
|
||||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
|
||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||
|
||||
@@ -27,20 +27,23 @@
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14"
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14"
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
@@ -64,10 +67,10 @@ int detect(void)
|
||||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
@@ -116,7 +119,14 @@ void get_cpuconfig(void)
|
||||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define L1_DATA_SIZE 131072\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8\n");
|
||||
printf("#define L2_SIZE 4194304\n");
|
||||
printf("#define L2_LINESIZE 256\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
2
ctest.c
2
ctest.c
@@ -113,7 +113,7 @@ ARCH_X86
|
||||
ARCH_X86_64
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
|
||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
|
||||
ARCH_POWER
|
||||
#endif
|
||||
|
||||
|
||||
@@ -6,6 +6,8 @@ TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
LIB = $(TOPDIR)/$(LIBNAME)
|
||||
|
||||
|
||||
@@ -577,7 +577,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -1503,6 +1503,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -653,7 +653,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -653,7 +653,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -577,7 +577,7 @@
|
||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||
* ************************* STEST1 *****************************
|
||||
*
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||
*
|
||||
|
||||
@@ -1504,6 +1504,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -62,9 +62,36 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef TRANSA
|
||||
#ifndef thread_local
|
||||
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
|
||||
# define thread_local _Thread_local
|
||||
# elif defined _WIN32 && ( \
|
||||
defined _MSC_VER || \
|
||||
defined __ICL || \
|
||||
defined __DMC__ || \
|
||||
defined __BORLANDC__ )
|
||||
# define thread_local __declspec(thread)
|
||||
/* note that ICC (linux) and Clang are covered by __GNUC__ */
|
||||
# elif defined __GNUC__ || \
|
||||
defined __SUNPRO_C || \
|
||||
defined __xlC__
|
||||
# define thread_local __thread
|
||||
# else
|
||||
# define UNSAFE
|
||||
#endif
|
||||
#endif
|
||||
#if defined USE_OPENMP
|
||||
#undef UNSAFE
|
||||
#endif
|
||||
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
#define Y_DUMMY_NUM 1024
|
||||
#if defined(USE_OPENMP)
|
||||
static FLOAT y_dummy[Y_DUMMY_NUM];
|
||||
#pragma omp threadprivate(y_dummy)
|
||||
# else
|
||||
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
|
||||
# endif
|
||||
#endif
|
||||
|
||||
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
@@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||
#ifdef TRANSA
|
||||
y += n_from * incy * COMPSIZE;
|
||||
#else
|
||||
# ifndef UNSAFE
|
||||
//for split matrix row (n) direction and vector x of gemv_n
|
||||
x += n_from * incx * COMPSIZE;
|
||||
//store partial result for every thread
|
||||
y += (m_to - m_from) * 1 * COMPSIZE * pos;
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
#ifndef TRANSA
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
int split_x=0;
|
||||
#endif
|
||||
|
||||
@@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
i -= width;
|
||||
}
|
||||
|
||||
#ifndef TRANSA
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
//try to split matrix on row direction and x.
|
||||
//Then, reduction.
|
||||
if (num_cpu < nthreads) {
|
||||
@@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
exec_blas(num_cpu, queue);
|
||||
}
|
||||
|
||||
#ifndef TRANSA
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
if(split_x==1){
|
||||
//reduction
|
||||
for(i=0; i<num_cpu; i++){
|
||||
|
||||
@@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */
|
||||
/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */
|
||||
for (is = 0; is < m; is += DTB_ENTRIES){
|
||||
|
||||
for (is = 0; is < m; is += DTB_ENTRIES * 100){
|
||||
|
||||
min_i = MIN(m - is, DTB_ENTRIES * 100);
|
||||
min_i = MIN(m - is, DTB_ENTRIES);
|
||||
|
||||
#ifndef TRANSA
|
||||
if (is > 0){
|
||||
fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n");
|
||||
if (is > 0){
|
||||
GEMV_N(is, min_i, 0, dp1,
|
||||
a + is * lda, lda,
|
||||
B + is, 1,
|
||||
|
||||
@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||
|
||||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
|
||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = trmv_kernel;
|
||||
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||
|
||||
range_m[num_cpu + 1] = range_m[num_cpu] + width;
|
||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
||||
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||
|
||||
queue[num_cpu].mode = mode;
|
||||
queue[num_cpu].routine = trmv_kernel;
|
||||
|
||||
@@ -48,6 +48,10 @@
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
#ifndef GEMM_PREFERED_SIZE
|
||||
#define GEMM_PREFERED_SIZE 1
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
@@ -510,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int round_up(int remainder, int width, int multiple)
|
||||
{
|
||||
if (multiple > remainder || width <= multiple)
|
||||
return width;
|
||||
width = (width + multiple - 1) / multiple;
|
||||
width = width * multiple;
|
||||
return width;
|
||||
}
|
||||
|
||||
|
||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb,
|
||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
@@ -554,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
/* Dynamically allocate workspace */
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
@@ -601,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
num_parts = 0;
|
||||
while (m > 0){
|
||||
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
||||
|
||||
width = round_up(m, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
m -= width;
|
||||
|
||||
if (m < 0) width = width + m;
|
||||
range_M[num_parts + 1] = range_M[num_parts] + width;
|
||||
|
||||
num_parts ++;
|
||||
}
|
||||
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
||||
@@ -645,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
if (width < SWITCH_RATIO) {
|
||||
width = SWITCH_RATIO;
|
||||
}
|
||||
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
n -= width;
|
||||
if (n < 0) width = width + n;
|
||||
range_N[num_parts + 1] = range_N[num_parts] + width;
|
||||
|
||||
num_parts ++;
|
||||
}
|
||||
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
||||
@@ -671,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
|
||||
BLASLONG width, i;
|
||||
BLASLONG n_from, n_to;
|
||||
double dnum, nf, nt, di;
|
||||
double dnum, nf, nt, di, dinum;
|
||||
|
||||
int num_cpu;
|
||||
int mask = 0;
|
||||
@@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)i;
|
||||
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
|
||||
dinum = di * di +dnum;
|
||||
if (dinum <0)
|
||||
width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1);
|
||||
else
|
||||
width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
@@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
|
||||
nf = (double)(arg -> n - n_from);
|
||||
nt = (double)(arg -> n - n_to);
|
||||
|
||||
dnum = (nt * nt - nf * nf) / (double)nthreads;
|
||||
|
||||
num_cpu = 0;
|
||||
|
||||
range[0] = n_from;
|
||||
@@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)(arg -> n - i);
|
||||
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
dinum = di * di + dnum;
|
||||
if (dinum<0)
|
||||
width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1);
|
||||
else
|
||||
width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1);
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
} else {
|
||||
|
||||
@@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
|
||||
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
list(APPEND COMMON_SOURCES dynamic.c)
|
||||
if (ARM64)
|
||||
list(APPEND COMMON_SOURCES dynamic_arm64.c)
|
||||
else ()
|
||||
list(APPEND COMMON_SOURCES dynamic.c)
|
||||
endif ()
|
||||
else ()
|
||||
list(APPEND COMMON_SOURCES parameter.c)
|
||||
endif ()
|
||||
|
||||
@@ -15,7 +15,15 @@ endif
|
||||
# COMMONOBJS += info.$(SUFFIX)
|
||||
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(ARCH),arm64)
|
||||
COMMONOBJS += dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),power)
|
||||
COMMONOBJS += dynamic_power.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
@@ -71,7 +79,15 @@ BLAS_SERVER = blas_server.c
|
||||
endif
|
||||
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(ARCH),arm64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),power)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
||||
@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
|
||||
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
|
||||
/* jobs is queued. */
|
||||
|
||||
/* We need this grobal for cheking if initialization is finished. */
|
||||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||
|
||||
/* Local Variables */
|
||||
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
|
||||
|
||||
#ifdef MONITOR
|
||||
|
||||
/* Monitor is a function to see thread's status for every seconds. */
|
||||
/* Usually it turns off and it's for debugging. */
|
||||
/* Monitor is a function to see thread's status for every second. */
|
||||
/* Usually it turns off and it's for debugging. */
|
||||
|
||||
static pthread_t monitor_thread;
|
||||
static int main_status[MAX_CPU_NUMBER];
|
||||
@@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
|
||||
|
||||
long i;
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
|
||||
/* This is a thread implementation for Win32 lazy implementation */
|
||||
|
||||
/* Thread server common infomation */
|
||||
/* Thread server common information */
|
||||
typedef struct{
|
||||
CRITICAL_SECTION lock;
|
||||
HANDLE filled;
|
||||
@@ -61,7 +61,7 @@ typedef struct{
|
||||
|
||||
} blas_pool_t;
|
||||
|
||||
/* We need this global for cheking if initialization is finished. */
|
||||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail = 0;
|
||||
|
||||
/* Local Variables */
|
||||
@@ -461,13 +461,22 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
SetEvent(pool.killed);
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
// Could also just use WaitForMultipleObjects
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
CloseHandle(pool.filled);
|
||||
CloseHandle(pool.killed);
|
||||
|
||||
blas_server_avail = 0;
|
||||
}
|
||||
|
||||
@@ -478,7 +487,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
void goto_set_num_threads(int num_threads)
|
||||
{
|
||||
long i;
|
||||
long i;
|
||||
|
||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
if (num_threads < 1) num_threads = blas_cpu_number;
|
||||
|
||||
|
||||
@@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#define VENDOR_INTEL 1
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
@@ -304,9 +305,49 @@ int support_avx(){
|
||||
#endif
|
||||
}
|
||||
|
||||
int support_avx2(){
|
||||
#ifndef NO_AVX2
|
||||
int eax, ebx, ecx=0, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 0)
|
||||
ret=1; //OS supports AVX2
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int support_avx512(){
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
xgetbv(0, &eax, &edx);
|
||||
if((eax & 0xe0) == 0xe0)
|
||||
ret=1; //OS supports AVX512VL
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
|
||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
|
||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
static int get_vendor(void){
|
||||
@@ -329,6 +370,7 @@ static int get_vendor(void){
|
||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
@@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
//Intel Haswell
|
||||
if (model == 12 || model == 15) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 13) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
@@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){
|
||||
case 4:
|
||||
//Intel Haswell
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 7 || model == 15) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
@@ -457,72 +514,104 @@ static gotoblas_t *get_coretype(void){
|
||||
case 5:
|
||||
//Intel Broadwell
|
||||
if (model == 6) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx())
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Phi Knights Landing
|
||||
if (model == 7) {
|
||||
if(support_avx())
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake
|
||||
if (model == 12) {
|
||||
//Apollo Lake or Denverton
|
||||
if (model == 12 || model == 15) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 6:
|
||||
if (model == 6) {
|
||||
// Cannon Lake
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return &gotoblas_HASWELL;
|
||||
#else
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return &gotoblas_NEHALEM;
|
||||
#endif
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake
|
||||
if(support_avx())
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
@@ -535,7 +624,7 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
|
||||
if (family <= 0xe) {
|
||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
|
||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||
@@ -615,6 +704,13 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
||||
204
driver/others/dynamic_arm64.c
Normal file
204
driver/others/dynamic_arm64.c
Normal file
@@ -0,0 +1,204 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
* that HWCAP_CPUID is defined
|
||||
*/
|
||||
#ifndef HWCAP_CPUID
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
asm("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
"armv8",
|
||||
"cortexa57",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype) {
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i=0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_ARMV8);
|
||||
case 1: return (&gotoblas_CORTEXA57);
|
||||
case 2: return (&gotoblas_THUNDERX);
|
||||
case 3: return (&gotoblas_THUNDERX2T99);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
* MIDR_EL1
|
||||
*
|
||||
* 31 24 23 20 19 16 15 4 3 0
|
||||
* -----------------------------------------------------------------
|
||||
* | Implementer | Variant | Architecture | Part Number | Revision |
|
||||
* -----------------------------------------------------------------
|
||||
*/
|
||||
implementer = (midr_el1 >> 24) & 0xFF;
|
||||
part = (midr_el1 >> 4) & 0xFFF;
|
||||
|
||||
switch(implementer)
|
||||
{
|
||||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
case 0xd07: // Cortex A57
|
||||
case 0xd08: // Cortex A72
|
||||
case 0xd03: // Cortex A53
|
||||
return &gotoblas_CORTEXA57;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
switch (part)
|
||||
{
|
||||
case 0x516: // Vulcan
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
case 0x43: // Cavium
|
||||
switch (part)
|
||||
{
|
||||
case 0x0a1: // ThunderX
|
||||
return &gotoblas_THUNDERX;
|
||||
case 0x0af: // ThunderX2
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_ARMV8;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
110
driver/others/dynamic_power.c
Normal file
110
driver/others/dynamic_power.c
Normal file
@@ -0,0 +1,110 @@
|
||||
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
static char *corename[] = {
|
||||
"unknown",
|
||||
"POWER6",
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
};
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
|
||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
|
||||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char * coretype) {
|
||||
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i = 0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to POWER8 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_POWER8;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
|
||||
|
||||
int mynode = 1;
|
||||
|
||||
/* if number of threads is larger than inital condition */
|
||||
/* if number of threads is larger than initial condition */
|
||||
if (pos < 0) {
|
||||
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
|
||||
return 0;
|
||||
@@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
|
||||
common -> shmid = pshmid;
|
||||
|
||||
if (common -> magic != SH_MAGIC) {
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if __GLIBC_PREREQ(2, 7)
|
||||
cpu_set_t *cpusetp;
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
#endif
|
||||
#endif
|
||||
int nums;
|
||||
int ret;
|
||||
|
||||
@@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
|
||||
}
|
||||
CPU_FREE(cpusetp);
|
||||
#else
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
|
||||
if (ret!=0) {
|
||||
common->num_procs = nums;
|
||||
} else {
|
||||
@@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
|
||||
int i;
|
||||
int n = 0;
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
common->num_procs = n;
|
||||
}
|
||||
#else
|
||||
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
common->num_procs = CPU_COUNT(&cpuset);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -192,51 +192,74 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
return nums;
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
@@ -259,6 +282,16 @@ int get_num_procs(void) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
@@ -279,7 +312,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -371,7 +404,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -379,7 +412,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -403,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -1280,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
|
||||
free(map_address);
|
||||
}
|
||||
|
||||
#ifdef SMP
|
||||
void blas_thread_memory_cleanup(void) {
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void blas_shutdown(void){
|
||||
#ifdef SMP
|
||||
BLASFUNC(blas_thread_shutdown)();
|
||||
@@ -1289,7 +1329,7 @@ void blas_shutdown(void){
|
||||
/* Only cleanupIf we were built for threading and TLS was initialized */
|
||||
if (local_storage_key)
|
||||
#endif
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
blas_thread_memory_cleanup();
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
base_address = 0UL;
|
||||
@@ -1476,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) {
|
||||
|
||||
blas_shutdown();
|
||||
|
||||
#if defined(SMP)
|
||||
#if defined(OS_WINDOWS)
|
||||
TlsFree(local_storage_key);
|
||||
#else
|
||||
pthread_key_delete(local_storage_key);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
@@ -1511,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
|
||||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
#if defined(SMP)
|
||||
blas_memory_cleanup((void*)get_memory_table());
|
||||
blas_thread_memory_cleanup();
|
||||
#endif
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
@@ -1574,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
gotoblas_init();
|
||||
gotoblas_quit();
|
||||
|
||||
#if __PGIC__ < 19
|
||||
#if 0
|
||||
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
||||
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
||||
@@ -1581,13 +1630,16 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
||||
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
/* USE_TLS / COMPILE_TLS not set */
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||
#define ALLOC_WINDOWS
|
||||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
@@ -1601,7 +1653,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#ifndef OS_WINDOWS
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
@@ -1621,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -1660,9 +1712,12 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
|
||||
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
||||
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
||||
#else
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
@@ -1681,50 +1736,75 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i,n;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
return nums;
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,cpusetp)) n++;
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) return nums;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
nums = CPU_COUNT_S(size,cpusetp);
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
@@ -1739,6 +1819,22 @@ int get_num_procs(void) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
@@ -1759,7 +1855,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -1836,7 +1932,7 @@ void openblas_fork_handler()
|
||||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
|
||||
// In the mean time build with USE_OPENMP=0 or link against another
|
||||
// implementation of OpenMP.
|
||||
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
int err;
|
||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
|
||||
if(err != 0)
|
||||
@@ -1849,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1857,11 +1953,11 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
blas_goto_num = 0;
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
@@ -1873,7 +1969,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
@@ -1881,7 +1977,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -1945,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
|
||||
|
||||
static void alloc_mmap_free(struct release_t *release){
|
||||
|
||||
if (!release->address) return;
|
||||
|
||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
printf("OpenBLAS : munmap failed\n");
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : munmap failed:");
|
||||
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1968,11 +2068,21 @@ static void *alloc_mmap(void *address){
|
||||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
} else {
|
||||
#ifdef DEBUG
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : mmap failed:");
|
||||
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
@@ -2114,14 +2224,18 @@ static void *alloc_mmap(void *address){
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
}
|
||||
#endif
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
@@ -2489,7 +2603,7 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
int position;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int mypos;
|
||||
int mypos = 0;
|
||||
#endif
|
||||
|
||||
void *map_address;
|
||||
@@ -2520,6 +2634,11 @@ void *blas_memory_alloc(int procpos){
|
||||
NULL,
|
||||
};
|
||||
void *(**func)(void *address);
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory_initialized) {
|
||||
#endif
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
@@ -2555,12 +2674,15 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#if defined(USE_OPENMP)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Alloc Start ...\n");
|
||||
#endif
|
||||
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
/* #if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
|
||||
mypos = WhereAmI();
|
||||
|
||||
@@ -2569,13 +2691,17 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
/* blas_lock(&memory[position].lock);*/
|
||||
|
||||
#else
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
position ++;
|
||||
@@ -2583,25 +2709,30 @@ void *blas_memory_alloc(int procpos){
|
||||
} while (position < NUM_BUFFERS);
|
||||
|
||||
|
||||
#endif
|
||||
#endif */
|
||||
|
||||
position = 0;
|
||||
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
do {
|
||||
/* if (!memory[position].used) { */
|
||||
/* blas_lock(&memory[position].lock);*/
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory[position].used) {
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
/* } */
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
blas_unlock(&memory[position].lock);
|
||||
}
|
||||
#endif
|
||||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
goto error;
|
||||
|
||||
allocation :
|
||||
@@ -2611,10 +2742,11 @@ void *blas_memory_alloc(int procpos){
|
||||
#endif
|
||||
|
||||
memory[position].used = 1;
|
||||
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].addr) {
|
||||
do {
|
||||
#ifdef DEBUG
|
||||
@@ -2631,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
#ifdef ALLOC_DEVICEDRIVER
|
||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2659,9 +2791,13 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
memory[position].addr = map_address;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
||||
@@ -2715,8 +2851,9 @@ void blas_memory_free(void *free_area){
|
||||
#endif
|
||||
|
||||
position = 0;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
@@ -2730,7 +2867,9 @@ void blas_memory_free(void *free_area){
|
||||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap Succeeded.\n\n");
|
||||
@@ -2745,8 +2884,9 @@ void blas_memory_free(void *free_area){
|
||||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2796,7 +2936,7 @@ void blas_shutdown(void){
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||
@@ -2821,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||
if (hot_alloc != 2) {
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
LOCK_COMMAND(&init_lock);
|
||||
#endif
|
||||
|
||||
@@ -2831,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||
size -= PAGESIZE;
|
||||
}
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
UNLOCK_COMMAND(&init_lock);
|
||||
#endif
|
||||
|
||||
@@ -3064,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
|
||||
gotoblas_init();
|
||||
gotoblas_quit();
|
||||
|
||||
#if __PGIC__ < 19
|
||||
#if 0
|
||||
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
||||
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
||||
@@ -3072,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
||||
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -38,21 +38,29 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Dummy Function */
|
||||
int goto_get_num_procs (void) { return 1;};
|
||||
void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int m[2];
|
||||
size_t len;
|
||||
|
||||
if (nums == 0) {
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &nums, &len, NULL, 0);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
size_t len;
|
||||
if (nums == 0){
|
||||
len = sizeof(int);
|
||||
sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
|
||||
}
|
||||
return nums;
|
||||
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the numbers of CPU cores in multithreading.
|
||||
It can be set by openblas_set_num_threads(int num_threads);
|
||||
*/
|
||||
int blas_cpu_number = 0;
|
||||
/*
|
||||
The numbers of threads in the thread pool.
|
||||
This value is equal or large than blas_cpu_number. This means some threads are sleep.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
||||
void openblas_fork_handler()
|
||||
{
|
||||
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
|
||||
// built with "make USE_OPENMP=0".
|
||||
// Hanging can still happen when OpenBLAS is built against the libgomp
|
||||
// implementation of OpenMP. The problem is tracked at:
|
||||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
|
||||
// In the mean time build with USE_OPENMP=0 or link against another
|
||||
// implementation of OpenMP.
|
||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
int err;
|
||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
|
||||
if(err != 0)
|
||||
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
int openblas_get_num_procs(void) {
|
||||
#ifndef SMP
|
||||
return 1;
|
||||
#else
|
||||
return get_num_procs();
|
||||
#endif
|
||||
}
|
||||
|
||||
int openblas_get_num_threads(void) {
|
||||
#ifndef SMP
|
||||
return 1;
|
||||
#else
|
||||
// init blas_cpu_number if needed
|
||||
blas_get_cpu_number();
|
||||
return blas_cpu_number;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -35,15 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
#if _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static char* openblas_config_str=""
|
||||
"OpenBLAS "
|
||||
VERSION
|
||||
" "
|
||||
#ifdef USE64BITINT
|
||||
"USE64BITINT "
|
||||
" USE64BITINT "
|
||||
#endif
|
||||
#ifdef NO_CBLAS
|
||||
"NO_CBLAS "
|
||||
@@ -81,10 +78,10 @@ char tmpstr[20];
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
@@ -730,35 +730,8 @@ void blas_set_parameter(void){
|
||||
|
||||
#if defined(ARCH_ARM64)
|
||||
|
||||
#if defined(VULCAN) || defined(THUNDERX2T99)
|
||||
unsigned long dgemm_prefetch_size_a;
|
||||
unsigned long dgemm_prefetch_size_b;
|
||||
unsigned long dgemm_prefetch_size_c;
|
||||
#endif
|
||||
|
||||
void blas_set_parameter(void)
|
||||
{
|
||||
#if defined(VULCAN) || defined(THUNDERX2T99)
|
||||
dgemm_p = 160;
|
||||
dgemm_q = 128;
|
||||
dgemm_r = 4096;
|
||||
|
||||
sgemm_p = 128;
|
||||
sgemm_q = 352;
|
||||
sgemm_r = 4096;
|
||||
|
||||
cgemm_p = 128;
|
||||
cgemm_q = 224;
|
||||
cgemm_r = 4096;
|
||||
|
||||
zgemm_p = 128;
|
||||
zgemm_q = 112;
|
||||
zgemm_r = 4096;
|
||||
|
||||
dgemm_prefetch_size_a = 3584;
|
||||
dgemm_prefetch_size_b = 512;
|
||||
dgemm_prefetch_size_c = 128;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
897
dynamic.c
Normal file
897
dynamic.c
Normal file
@@ -0,0 +1,897 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strncasecmp _strnicmp
|
||||
#define strcasecmp _stricmp
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#define EXTERN extern
|
||||
#else
|
||||
#define EXTERN
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_LIST
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
|
||||
#ifdef DYN_ATHLON
|
||||
extern gotoblas_t gotoblas_ATHLON;
|
||||
#else
|
||||
#define gotoblas_ATHLON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_KATMAI
|
||||
extern gotoblas_t gotoblas_KATMAI;
|
||||
#else
|
||||
#define gotoblas_KATMAI gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BANIAS
|
||||
extern gotoblas_t gotoblas_BANIAS;
|
||||
#else
|
||||
#define gotoblas_BANIAS gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_COPPERMINE
|
||||
extern gotoblas_t gotoblas_COPPERMINE;
|
||||
#else
|
||||
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NORTHWOOD
|
||||
extern gotoblas_t gotoblas_NORTHWOOD;
|
||||
#else
|
||||
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_CORE2
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
#else
|
||||
#define gotoblas_CORE2 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NEHALEM
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
#else
|
||||
#define gotoblas_NEHALEM gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BARCELONA
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BARCELONA gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BARCELONA gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_ATOM
|
||||
extern gotoblas_t gotoblas_ATOM;
|
||||
elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_ATOM gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NANO
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
#else
|
||||
#define gotoblas_NANO gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PENRYN
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
#else
|
||||
#define gotoblas_PENRYN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_DUNNINGTON
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
#else
|
||||
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
#else
|
||||
#define gotoblas_OPTERON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON_SSE3
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
#else
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BOBCAT
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BOBCAT gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BOBCAT gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SANDYBRIDGE
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BULLDOZER
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BULLDOZER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PILEDRIVER
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_STEAMROLLER
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_EXCAVATOR
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_HASWELL
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_HASWELL gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_ZEN
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_ZEN gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_ZEN gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_ZEN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SKYLAKEX
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
|
||||
#endif
|
||||
|
||||
|
||||
#else // not DYNAMIC_LIST
|
||||
EXTERN gotoblas_t gotoblas_KATMAI;
|
||||
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
||||
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
||||
EXTERN gotoblas_t gotoblas_BANIAS;
|
||||
EXTERN gotoblas_t gotoblas_ATHLON;
|
||||
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#ifdef DYNAMIC_OLDER
|
||||
extern gotoblas_t gotoblas_ATOM;
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#else
|
||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||
#define gotoblas_NANO gotoblas_NEHALEM
|
||||
#define gotoblas_PENRYN gotoblas_CORE2
|
||||
#define gotoblas_DUNNINGTON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
|
||||
#define gotoblas_BOBCAT gotoblas_CORE2
|
||||
#endif
|
||||
|
||||
#ifndef NO_AVX
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#ifndef NO_AVX512
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
|
||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
#endif // DYNAMIC_LIST
|
||||
|
||||
#define VENDOR_INTEL 1
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
||||
#ifndef NO_AVX
|
||||
static inline void xgetbv(int op, int * eax, int * edx){
|
||||
//Use binary code for xgetbv
|
||||
__asm__ __volatile__
|
||||
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||
}
|
||||
#endif
|
||||
|
||||
int support_avx(){
|
||||
#ifndef NO_AVX
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
|
||||
xgetbv(0, &eax, &edx);
|
||||
if((eax & 6) == 6){
|
||||
ret=1; //OS support AVX
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int support_avx2(){
|
||||
#ifndef NO_AVX2
|
||||
int eax, ebx, ecx=0, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 0)
|
||||
ret=1; //OS supports AVX2
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int support_avx512(){
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
xgetbv(0, &eax, &edx);
|
||||
if((eax & 0xe0) == 0xe0)
|
||||
ret=1; //OS supports AVX512VL
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
|
||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
|
||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
static int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
|
||||
union
|
||||
{
|
||||
char vchar[16];
|
||||
int vint[4];
|
||||
} vendor;
|
||||
|
||||
cpuid(0, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
*(&vendor.vint[0]) = ebx;
|
||||
*(&vendor.vint[1]) = edx;
|
||||
*(&vendor.vint[2]) = ecx;
|
||||
|
||||
vendor.vchar[12] = '\0';
|
||||
|
||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
return VENDOR_UNKNOWN;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void){
|
||||
|
||||
int eax, ebx, ecx, edx;
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
family = BITMASK(eax, 8, 0x0f);
|
||||
exfamily = BITMASK(eax, 20, 0xff);
|
||||
model = BITMASK(eax, 4, 0x0f);
|
||||
exmodel = BITMASK(eax, 16, 0x0f);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
if (vendor == VENDOR_INTEL){
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
switch (exmodel) {
|
||||
case 0:
|
||||
if (model <= 0x7) return &gotoblas_KATMAI;
|
||||
if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
|
||||
if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
|
||||
if (model == 14) return &gotoblas_BANIAS;
|
||||
if (model == 15) return &gotoblas_CORE2;
|
||||
return NULL;
|
||||
|
||||
case 1:
|
||||
if (model == 6) return &gotoblas_CORE2;
|
||||
if (model == 7) return &gotoblas_PENRYN;
|
||||
if (model == 13) return &gotoblas_DUNNINGTON;
|
||||
if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
|
||||
if (model == 12) return &gotoblas_ATOM;
|
||||
return NULL;
|
||||
|
||||
case 2:
|
||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
|
||||
// Xeon (Clarkdale), 32nm
|
||||
if (model == 5) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Xeon Processor 5600 (Westmere-EP)
|
||||
//Xeon Processor E7 (Westmere-EX)
|
||||
//Xeon E7540
|
||||
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||
//Intel Core i7-3000 / Xeon E5
|
||||
if (model == 10 || model == 13) {
|
||||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 3:
|
||||
//Intel Sandy Bridge 22nm (Ivy Bridge?)
|
||||
if (model == 10 || model == 14) {
|
||||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Haswell
|
||||
if (model == 12 || model == 15) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 13) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 7) return &gotoblas_ATOM; //Bay Trail
|
||||
return NULL;
|
||||
case 4:
|
||||
//Intel Haswell
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 7 || model == 15) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Braswell / Avoton
|
||||
if (model == 12 || model == 13) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 5:
|
||||
//Intel Broadwell
|
||||
if (model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Phi Knights Landing
|
||||
if (model == 7) {
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake or Denverton
|
||||
if (model == 12 || model == 15) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 6:
|
||||
if (model == 6) {
|
||||
// Cannon Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
if (model <= 0x2) return &gotoblas_NORTHWOOD;
|
||||
return &gotoblas_PRESCOTT;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
|
||||
if (family <= 0xe) {
|
||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
|
||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||
if ( (eax & 0xffff) >= 0x01) {
|
||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
return NULL;
|
||||
|
||||
return &gotoblas_ATHLON;
|
||||
}
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) {
|
||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
||||
else return &gotoblas_OPTERON;
|
||||
} else if (exfamily == 5) {
|
||||
return &gotoblas_BOBCAT;
|
||||
} else if (exfamily == 6) {
|
||||
if(model == 1){
|
||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
if(support_avx())
|
||||
return &gotoblas_BULLDOZER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 2 || model == 3){
|
||||
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 5){
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0 || model == 8){
|
||||
if (exmodel == 1) {
|
||||
//AMD Trinity
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 3) {
|
||||
//AMD STEAMROLLER
|
||||
if(support_avx())
|
||||
return &gotoblas_STEAMROLLER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 6) {
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1 || model == 8) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static char *corename[] = {
|
||||
"Unknown",
|
||||
"Katmai",
|
||||
"Coppermine",
|
||||
"Northwood",
|
||||
"Prescott",
|
||||
"Banias",
|
||||
"Atom",
|
||||
"Core2",
|
||||
"Penryn",
|
||||
"Dunnington",
|
||||
"Nehalem",
|
||||
"Athlon",
|
||||
"Opteron",
|
||||
"Opteron_SSE3",
|
||||
"Barcelona",
|
||||
"Nano",
|
||||
"Sandybridge",
|
||||
"Bobcat",
|
||||
"Bulldozer",
|
||||
"Piledriver",
|
||||
"Haswell",
|
||||
"Steamroller",
|
||||
"Excavator",
|
||||
"Zen",
|
||||
"SkylakeX"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
||||
if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
||||
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
||||
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
|
||||
if (gotoblas == &gotoblas_OPTERON) return corename[13];
|
||||
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 24; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found < 0)
|
||||
{
|
||||
//strncpy(mname,coretype,20);
|
||||
snprintf(message, 128, "Core not found: %s\n",coretype);
|
||||
openblas_warning(1, message);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 24: return (&gotoblas_SKYLAKEX);
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
case 20: return (&gotoblas_HASWELL);
|
||||
case 19: return (&gotoblas_PILEDRIVER);
|
||||
case 18: return (&gotoblas_BULLDOZER);
|
||||
case 17: return (&gotoblas_BOBCAT);
|
||||
case 16: return (&gotoblas_SANDYBRIDGE);
|
||||
case 15: return (&gotoblas_NANO);
|
||||
case 14: return (&gotoblas_BARCELONA);
|
||||
case 13: return (&gotoblas_OPTERON);
|
||||
case 12: return (&gotoblas_OPTERON_SSE3);
|
||||
case 11: return (&gotoblas_ATHLON);
|
||||
case 10: return (&gotoblas_NEHALEM);
|
||||
case 9: return (&gotoblas_DUNNINGTON);
|
||||
case 8: return (&gotoblas_PENRYN);
|
||||
case 7: return (&gotoblas_CORE2);
|
||||
case 6: return (&gotoblas_ATOM);
|
||||
case 5: return (&gotoblas_BANIAS);
|
||||
case 4: return (&gotoblas_PRESCOTT);
|
||||
case 3: return (&gotoblas_NORTHWOOD);
|
||||
case 2: return (&gotoblas_COPPERMINE);
|
||||
case 1: return (&gotoblas_KATMAI);
|
||||
}
|
||||
return(NULL);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
#ifdef ARCH_X86
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
|
||||
#else
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
|
||||
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
|
||||
if (sizeof(void*) == 8) {
|
||||
if (gotoblas == &gotoblas_KATMAI ||
|
||||
gotoblas == &gotoblas_COPPERMINE ||
|
||||
gotoblas == &gotoblas_NORTHWOOD ||
|
||||
gotoblas == &gotoblas_BANIAS ||
|
||||
gotoblas == &gotoblas_ATHLON)
|
||||
gotoblas = &gotoblas_PRESCOTT;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
|
||||
gotoblas = NULL;
|
||||
|
||||
}
|
||||
@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
else
|
||||
@@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
@@ -141,6 +145,14 @@ else
|
||||
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else
|
||||
|
||||
ifneq ($(C_COMPILER), LSB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
@@ -152,6 +164,7 @@ else
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
endif
|
||||
endif
|
||||
rm -f linktest
|
||||
|
||||
|
||||
@@ -40,15 +40,25 @@
|
||||
|
||||
void gotoblas_init(void);
|
||||
void gotoblas_quit(void);
|
||||
#if defined(SMP) && defined(USE_TLS)
|
||||
void blas_thread_memory_cleanup(void);
|
||||
#endif
|
||||
|
||||
BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
|
||||
|
||||
if (reason == DLL_PROCESS_ATTACH) {
|
||||
gotoblas_init();
|
||||
}
|
||||
|
||||
if (reason == DLL_PROCESS_DETACH) {
|
||||
gotoblas_quit();
|
||||
switch(reason) {
|
||||
case DLL_PROCESS_ATTACH:
|
||||
gotoblas_init();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
gotoblas_quit();
|
||||
break;
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
case DLL_THREAD_DETACH:
|
||||
#if defined(SMP) && defined(USE_TLS)
|
||||
blas_thread_memory_cleanup();
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
||||
@@ -618,19 +618,6 @@
|
||||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
@@ -647,33 +634,8 @@
|
||||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
@@ -690,45 +652,8 @@
|
||||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
@@ -745,45 +670,8 @@
|
||||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
@@ -800,43 +688,13 @@
|
||||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
|
||||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
ssysv_aa_2stage,
|
||||
ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage,
|
||||
chesv_aa_2stage,
|
||||
chetrf_aa_2stage,
|
||||
chetrs_aa_2stage,
|
||||
csysv_aa_2stage,
|
||||
csytrf_aa_2stage,
|
||||
csytrs_aa_2stage,
|
||||
dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage,
|
||||
dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage,
|
||||
zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage,
|
||||
zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage,
|
||||
zsytrs_aa_2stage
|
||||
ilaenv2stage
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
@@ -3509,6 +3367,59 @@
|
||||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
# 3.7.0
|
||||
slasyf_rk, ssyconvf_rook, ssytf2_rk,
|
||||
ssytrf_rk, ssytrs_3, ssytri_3,
|
||||
ssytri_3x, ssycon_3, ssysv_rk,
|
||||
slasyf_aa, ssysv_aa, ssytrf_aa,
|
||||
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
|
||||
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
|
||||
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
|
||||
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
|
||||
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
|
||||
dsytf2_rk, dsytrf_rk, dsytrs_3,
|
||||
dsytri_3, dsytri_3x, dsycon_3,
|
||||
dsysv_rk, dlasyf_aa, dsysv_aa,
|
||||
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
|
||||
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
|
||||
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
|
||||
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
|
||||
dsbevd_2stage, dsygv_2stage, chetf2_rk,
|
||||
chetrf_rk, chetri_3, chetri_3x,
|
||||
chetrs_3, checon_3, chesv_rk,
|
||||
chesv_aa, chetrf_aa, chetrs_aa,
|
||||
clahef_aa, clahef_rk, clasyf_rk,
|
||||
clasyf_aa, csytf2_rk, csytrf_rk,
|
||||
csytrf_aa, csytrs_3, csytrs_aa,
|
||||
csytri_3, csytri_3x, csycon_3,
|
||||
csysv_rk, csysv_aa, csyconvf_rook,
|
||||
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
|
||||
chb2st_kernels, cheevd_2stage, cheev_2stage,
|
||||
cheevx_2stage, cheevr_2stage, chbev_2stage,
|
||||
chbevx_2stage, chbevd_2stage, chegv_2stage,
|
||||
zhetf2_rk, zhetrf_rk, zhetri_3,
|
||||
zhetri_3x, zhetrs_3, zhecon_3,
|
||||
zhesv_rk, zhesv_aa, zhetrf_aa,
|
||||
zhetrs_aa, zlahef_aa, zlahef_rk,
|
||||
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
|
||||
zsytrs_aa, zsytf2_rk, zsytrf_rk,
|
||||
zsytrf_aa, zsytrs_3, zsytri_3,
|
||||
zsytri_3x, zsycon_3, zsysv_rk,
|
||||
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
|
||||
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
|
||||
zheev_2stage, zheevx_2stage, zheevr_2stage,
|
||||
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
# 3.8.0
|
||||
ssysv_aa_2stage, ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage, chesv_aa_2stage,
|
||||
chetrf_aa_2stage, chetrs_aa_2stage,
|
||||
csysv_aa_2stage, csytrf_aa_2stage,
|
||||
csytrs_aa_2stage, dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage, dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage, zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage, zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage, zsytrs_aa_2stage
|
||||
);
|
||||
|
||||
|
||||
|
||||
21
f_check
21
f_check
@@ -19,7 +19,7 @@ $nofortran = 0;
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
@@ -125,11 +125,16 @@ if ($compiler eq "") {
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
@@ -277,6 +282,8 @@ $linker_a = "";
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
|
||||
@@ -292,9 +299,6 @@ if ($link ne "") {
|
||||
&& ($flags !~ /^-LIST:/)
|
||||
&& ($flags !~ /^-LANG:/)
|
||||
) {
|
||||
if ($vendor eq "PGI") {
|
||||
$flags =~ s/lib$/libso/;
|
||||
}
|
||||
$linker_L .= $flags . " ";
|
||||
}
|
||||
|
||||
@@ -311,17 +315,11 @@ if ($link ne "") {
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($vendor eq "PGI") {
|
||||
$flags =~ s/lib$/libso/;
|
||||
}
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath-link\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($vendor eq "PGI") {
|
||||
$flags =~ s/lib$/libso/;
|
||||
}
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
@@ -330,7 +328,6 @@ if ($link ne "") {
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
&& ($flags !~ /numa/)
|
||||
&& ($flags !~ /crt[0-9]/)
|
||||
&& ($flags !~ /gcc/)
|
||||
&& ($flags !~ /user32/)
|
||||
|
||||
146
getarch.c
146
getarch.c
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
#else
|
||||
#define NO_AVX512
|
||||
#endif
|
||||
/* #define FORCE_P2 */
|
||||
/* #define FORCE_KATMAI */
|
||||
/* #define FORCE_COPPERMINE */
|
||||
@@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_SKYLAKEX
|
||||
#ifdef NO_AVX512
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#else
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
@@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define LIBNAME "skylakex"
|
||||
#define CORENAME "SKYLAKEX"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
@@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "POWER8"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_POWER9)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER9"
|
||||
#define SUBDIRNAME "power"
|
||||
#define ARCHCONFIG "-DPOWER9 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "power9"
|
||||
#define CORENAME "POWER9"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_PPCG4
|
||||
#define FORCE
|
||||
@@ -927,11 +958,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ARCHCONFIG "-DARMV8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "armv8"
|
||||
#define CORENAME "ARMV8"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA53
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA53"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA53 " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa53"
|
||||
#define CORENAME "CORTEXA53"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA57
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
@@ -942,26 +990,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa57"
|
||||
#define CORENAME "CORTEXA57"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_VULCAN
|
||||
#ifdef FORCE_CORTEXA72
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "VULCAN"
|
||||
#define SUBARCHITECTURE "CORTEXA72"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DVULCAN " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
#define ARCHCONFIG "-DCORTEXA72 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "vulcan"
|
||||
#define CORENAME "VULCAN"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa72"
|
||||
#define CORENAME "CORTEXA72"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA73
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA73"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA73 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa73"
|
||||
#define CORENAME "CORTEXA73"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_FALKOR
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "FALKOR"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DFALKOR " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "falkor"
|
||||
#define CORENAME "FALKOR"
|
||||
#else
|
||||
#endif
|
||||
|
||||
@@ -973,13 +1052,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ARCHCONFIG "-DTHUNDERX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx"
|
||||
#define CORENAME "THUNDERX"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX2T99
|
||||
#define ARMV8
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "THUNDERX2T99"
|
||||
@@ -990,12 +1071,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx2t99"
|
||||
#define CORENAME "THUNDERX2T99"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_TSV110
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "TSV110"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTSV110 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "tsv110"
|
||||
#define CORENAME "TSV110"
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
@@ -1016,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "Z13"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_Z14
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
#define SUBARCHITECTURE "Z14"
|
||||
#define ARCHCONFIG "-DZ14 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64"
|
||||
#define LIBNAME "z14"
|
||||
#define CORENAME "Z14"
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt"
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
|
||||
defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__)
|
||||
#ifndef POWER
|
||||
@@ -1089,7 +1201,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1103,7 +1215,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
|
||||
@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
|
||||
rotm.c rotmg.c # N.B. these do not have complex counterparts
|
||||
rot.c
|
||||
asum.c
|
||||
sum.c
|
||||
)
|
||||
|
||||
# these will have 'z' prepended for the complex version
|
||||
@@ -23,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
|
||||
axpby.c
|
||||
)
|
||||
|
||||
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
|
||||
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
|
||||
# these all have 'z' sources for complex versions
|
||||
set(BLAS2_SOURCES
|
||||
gemv.c ger.c
|
||||
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
|
||||
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ SBLAS1OBJS = \
|
||||
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
|
||||
scopy.$(SUFFIX) sscal.$(SUFFIX) \
|
||||
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
|
||||
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
|
||||
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
|
||||
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
|
||||
@@ -51,7 +51,7 @@ DBLAS1OBJS = \
|
||||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
||||
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
|
||||
ddot.$(SUFFIX) \
|
||||
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
|
||||
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
|
||||
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
|
||||
@@ -76,7 +76,7 @@ CBLAS1OBJS = \
|
||||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
|
||||
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
|
||||
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||
scamax.$(SUFFIX) icamax.$(SUFFIX) \
|
||||
scamin.$(SUFFIX) icamin.$(SUFFIX) \
|
||||
csrot.$(SUFFIX) crotg.$(SUFFIX) \
|
||||
@@ -105,7 +105,7 @@ ZBLAS1OBJS = \
|
||||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
|
||||
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
|
||||
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
|
||||
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
|
||||
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
|
||||
@@ -146,7 +146,7 @@ QBLAS1OBJS = \
|
||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||
qdot.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||
@@ -168,7 +168,7 @@ XBLAS1OBJS = \
|
||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
|
||||
QBLAS1OBJS = \
|
||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||
@@ -224,7 +224,7 @@ QBLAS3OBJS = \
|
||||
XBLAS1OBJS = \
|
||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||
@@ -263,7 +263,8 @@ CSBLAS1OBJS = \
|
||||
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
||||
|
||||
CSBLAS2OBJS = \
|
||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||
@@ -280,7 +281,8 @@ CDBLAS1OBJS = \
|
||||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
||||
|
||||
CDBLAS2OBJS = \
|
||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||
@@ -300,7 +302,8 @@ CCBLAS1OBJS = \
|
||||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX)
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||
@@ -326,7 +329,9 @@ CZBLAS1OBJS = \
|
||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX)
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||
|
||||
|
||||
CZBLAS2OBJS = \
|
||||
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \
|
||||
@@ -389,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
||||
SLAPACKOBJS = \
|
||||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
|
||||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#DLAPACKOBJS = \
|
||||
@@ -400,14 +405,14 @@ SLAPACKOBJS = \
|
||||
DLAPACKOBJS = \
|
||||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
|
||||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
QLAPACKOBJS = \
|
||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
||||
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
||||
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
|
||||
qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
qtrtrs.$(SUFFIX)
|
||||
|
||||
#CLAPACKOBJS = \
|
||||
# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
@@ -418,7 +423,7 @@ QLAPACKOBJS = \
|
||||
CLAPACKOBJS = \
|
||||
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#ZLAPACKOBJS = \
|
||||
@@ -430,13 +435,14 @@ CLAPACKOBJS = \
|
||||
ZLAPACKOBJS = \
|
||||
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
|
||||
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
XLAPACKOBJS = \
|
||||
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
||||
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xtrtrs.$(SUFFIX)
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
SBLASOBJS += $(SLAPACKOBJS)
|
||||
@@ -560,6 +566,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
|
||||
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
@@ -1383,6 +1407,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c
|
||||
cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
@@ -1395,6 +1431,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
|
||||
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
@@ -1402,7 +1450,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
@@ -1984,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
@@ -1993,7 +2041,25 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c
|
||||
|
||||
@@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
if (incx == 0 && incy == 0) {
|
||||
*y += n * alpha *(*x);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
@@ -86,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
//
|
||||
//Temporarily work-around the low performance issue with small imput size &
|
||||
//Temporarily work-around the low performance issue with small input size &
|
||||
//multithreads.
|
||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||
nthreads = 1;
|
||||
|
||||
@@ -271,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
|
||||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) {
|
||||
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc);
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
args.alpha = (void *)α
|
||||
args.beta = (void *)β
|
||||
|
||||
171
interface/lapack/trtrs.c
Normal file
171
interface/lapack/trtrs.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "STRTRS"
|
||||
#endif
|
||||
|
||||
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||
TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE, TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE, TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE, TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE,
|
||||
};
|
||||
|
||||
#ifdef SMP
|
||||
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||
TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL, TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL, TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL, TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL,
|
||||
};
|
||||
#endif
|
||||
|
||||
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
||||
FLOAT *b, blasint *ldB, blasint *Info){
|
||||
|
||||
char uplo_arg = *UPLO;
|
||||
char trans_arg = *TRANS;
|
||||
char diag_arg = *DIAG;
|
||||
|
||||
blas_arg_t args;
|
||||
|
||||
blasint info;
|
||||
int uplo, trans, diag;
|
||||
FLOAT *buffer;
|
||||
#ifdef PPC440
|
||||
extern
|
||||
#endif
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
args.m = *N;
|
||||
args.n = *NRHS;
|
||||
args.a = (void *)a;
|
||||
args.lda = *ldA;
|
||||
args.b = (void *)b;
|
||||
args.ldb = *ldB;
|
||||
|
||||
info = 0;
|
||||
|
||||
TOUPPER(trans_arg);
|
||||
trans = -1;
|
||||
if (trans_arg == 'N') trans = 0;
|
||||
if (trans_arg == 'T') trans = 1;
|
||||
if (trans_arg == 'R') trans = 0;
|
||||
if (trans_arg == 'C') trans = 1;
|
||||
|
||||
uplo = -1;
|
||||
if (uplo_arg == 'U') uplo = 0;
|
||||
if (uplo_arg == 'L') uplo = 1;
|
||||
|
||||
diag = -1;
|
||||
if (diag_arg == 'U') diag = 0;
|
||||
if (diag_arg == 'N') diag = 1;
|
||||
|
||||
if (args.ldb < MAX(1, args.m)) info = 9;
|
||||
if (args.lda < MAX(1, args.m)) info = 7;
|
||||
if (args.n < 0) info = 5;
|
||||
if (args.m < 0) info = 4;
|
||||
if (trans < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (diag < 0) info = 3;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
args.alpha = NULL;
|
||||
args.beta = NULL;
|
||||
|
||||
*Info = 0;
|
||||
|
||||
if (args.m == 0) return 0;
|
||||
|
||||
if (diag) {
|
||||
if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
|
||||
*Info = IAMIN_K(args.m, args.a, args.lda + 1);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
#ifndef PPC440
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
(trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
(trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef PPC440
|
||||
blas_memory_free(buffer);
|
||||
#endif
|
||||
|
||||
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user