Compare commits
530 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
898212efcd | ||
|
|
210a1584c5 | ||
|
|
f2a7a67f5a | ||
|
|
e0e88f9edc | ||
|
|
5dc6aa74f0 | ||
|
|
e78fbe4654 | ||
|
|
b4f4ed378b | ||
|
|
cbc41973fd | ||
|
|
1b6db3dbba | ||
|
|
f681553c6a | ||
|
|
afadeeba2a | ||
|
|
02d4a49761 | ||
|
|
4d7dfe4845 | ||
|
|
af0a69f355 | ||
|
|
5a2fe5bfb9 | ||
|
|
342d3e8b5c | ||
|
|
efbd7c7840 | ||
|
|
3a7955cd93 | ||
|
|
47ba85f314 | ||
|
|
30f23be0f9 | ||
|
|
49bbf330ca | ||
|
|
38d5b4b124 | ||
|
|
6e3fbe8ac5 | ||
|
|
86273392e5 | ||
|
|
d909f9f3d4 | ||
|
|
12d3d94e2e | ||
|
|
f349be3bdb | ||
|
|
4777eb678f | ||
|
|
415876d117 | ||
|
|
da8435dc36 | ||
|
|
4c7065f3ee | ||
|
|
f62bfaafe8 | ||
|
|
d947116390 | ||
|
|
f176ff90af | ||
|
|
f4d4abd423 | ||
|
|
2b9443b7e7 | ||
|
|
fe0e66564e | ||
|
|
a6351e32f0 | ||
|
|
5b4b385ecf | ||
|
|
1dea57ab25 | ||
|
|
54ffe280df | ||
|
|
029d1e16b9 | ||
|
|
ea8e208029 | ||
|
|
0fca36c8c3 | ||
|
|
44cc7cdecc | ||
|
|
6492131792 | ||
|
|
6c8ec55fb7 | ||
|
|
fab746240c | ||
|
|
847607c768 | ||
|
|
4c81d1c3fe | ||
|
|
db4908ebfa | ||
|
|
ed3eb18cb2 | ||
|
|
239ff330f8 | ||
|
|
19c81a07cb | ||
|
|
e008646ba9 | ||
|
|
498479b13e | ||
|
|
b4cbfe6677 | ||
|
|
be1a42507c | ||
|
|
7bb59fceb7 | ||
|
|
eba2cd951e | ||
|
|
836c7fb9f5 | ||
|
|
d2693eac04 | ||
|
|
8acb6fe3a8 | ||
|
|
c47e35acee | ||
|
|
a27a61bb9a | ||
|
|
69560ad3ce | ||
|
|
b2319fd97a | ||
|
|
0266ba7cb6 | ||
|
|
7e09570e04 | ||
|
|
14e33e0f7e | ||
|
|
db57c449dc | ||
|
|
993e56b7b3 | ||
|
|
c9304199cf | ||
|
|
d86290edf0 | ||
|
|
89429fdaa2 | ||
|
|
d511063098 | ||
|
|
4f4e286bf6 | ||
|
|
ddb6cee0d5 | ||
|
|
cecc2c65aa | ||
|
|
220f6a1c55 | ||
|
|
2f6326a630 | ||
|
|
c0d0406b97 | ||
|
|
8f22ac552b | ||
|
|
da623ae838 | ||
|
|
eb2fdd3af0 | ||
|
|
0d8d261dd4 | ||
|
|
40caaef052 | ||
|
|
25b602d8a6 | ||
|
|
4ed99c2ce3 | ||
|
|
f20c4edc33 | ||
|
|
3cfdb1770c | ||
|
|
8186963d8c | ||
|
|
a4543e4918 | ||
|
|
2376aa1e8c | ||
|
|
4620f98812 | ||
|
|
726c44242b | ||
|
|
dcfc5cf714 | ||
|
|
06e3b07ecb | ||
|
|
623be6600a | ||
|
|
7ddc9d384c | ||
|
|
6ebcce229f | ||
|
|
1b5620b66e | ||
|
|
1f8bda71b9 | ||
|
|
3be660c000 | ||
|
|
1a8b6134c2 | ||
|
|
f0b822a709 | ||
|
|
130327e9af | ||
|
|
750719528a | ||
|
|
91e2b11d3c | ||
|
|
548aa522e5 | ||
|
|
6423b282a1 | ||
|
|
9335d42740 | ||
|
|
39ef0880ae | ||
|
|
b7da75e4fd | ||
|
|
a7627c5afd | ||
|
|
9499ab0d45 | ||
|
|
307c4c0786 | ||
|
|
e83df93975 | ||
|
|
13fa9f737d | ||
|
|
5958ffc9b6 | ||
|
|
cd0e4aadb1 | ||
|
|
e2621ef93a | ||
|
|
9e1b43ea9b | ||
|
|
5269348178 | ||
|
|
92e024bbb3 | ||
|
|
c4b464cac6 | ||
|
|
e6dd44d989 | ||
|
|
baf03a0937 | ||
|
|
7aab5e826c | ||
|
|
29417adf4c | ||
|
|
9d292d37b2 | ||
|
|
2e8ff4a781 | ||
|
|
dbba381dc3 | ||
|
|
f61991d439 | ||
|
|
efdbdd8f82 | ||
|
|
3906ef3b0f | ||
|
|
8adf0971d8 | ||
|
|
08e2e60762 | ||
|
|
fb9e678235 | ||
|
|
dc4fcb48df | ||
|
|
7a48247761 | ||
|
|
7dfc45e840 | ||
|
|
7fb6e576c2 | ||
|
|
cbb70438df | ||
|
|
706a08d4a0 | ||
|
|
9f3d903817 | ||
|
|
590be3fae3 | ||
|
|
3521cd48cb | ||
|
|
1e0192a5cc | ||
|
|
fe9aff17fe | ||
|
|
8c25b440a0 | ||
|
|
f84197c1a7 | ||
|
|
734bd265a8 | ||
|
|
1217eb910d | ||
|
|
d6d7a6685d | ||
|
|
f0e7345fb8 | ||
|
|
42f048cf6c | ||
|
|
4fbc0777f4 | ||
|
|
d7472606d5 | ||
|
|
03297ff9f0 | ||
|
|
2d8d0af0ea | ||
|
|
5f677e782e | ||
|
|
04c60cee5d | ||
|
|
3a53207cc9 | ||
|
|
0e73d20629 | ||
|
|
02087a62e7 | ||
|
|
03b4d79a7e | ||
|
|
5c729c6dce | ||
|
|
e1911b2e60 | ||
|
|
8f33da4f94 | ||
|
|
26ccf643a3 | ||
|
|
32264ba496 | ||
|
|
4ecf631f95 | ||
|
|
5af510081d | ||
|
|
164551d5a2 | ||
|
|
310b76aad7 | ||
|
|
c4da892ba0 | ||
|
|
cbfd3c87e1 | ||
|
|
26e87ac517 | ||
|
|
15b9d6b4a7 | ||
|
|
f7bcd962c1 | ||
|
|
93cc066921 | ||
|
|
2c7d4a7766 | ||
|
|
eef1c42f03 | ||
|
|
73f637e584 | ||
|
|
8b90e5f202 | ||
|
|
bd60fb6ffc | ||
|
|
37ea8702ee | ||
|
|
ec7d6c02bc | ||
|
|
c90c23e78f | ||
|
|
bda8820da7 | ||
|
|
c0ca63ea46 | ||
|
|
f497bb949b | ||
|
|
f86b1bc3da | ||
|
|
206e03fdac | ||
|
|
8b599836db | ||
|
|
9721b57ecf | ||
|
|
380f955078 | ||
|
|
49d18e65e3 | ||
|
|
904f9a267d | ||
|
|
4c033730bb | ||
|
|
65502c6af6 | ||
|
|
f71627fa2e | ||
|
|
d8d7bd33cb | ||
|
|
e72420e8c5 | ||
|
|
d00709e016 | ||
|
|
d444344497 | ||
|
|
fb7308b9b5 | ||
|
|
db50b24a4a | ||
|
|
88b70fba3e | ||
|
|
4c1d47098b | ||
|
|
40000d1f64 | ||
|
|
dc3664993c | ||
|
|
b8232c9054 | ||
|
|
114bbbc6d7 | ||
|
|
b67a92c19f | ||
|
|
4bf00da8fb | ||
|
|
c26780d451 | ||
|
|
d77d9bc920 | ||
|
|
37d3e2bd94 | ||
|
|
de8656769c | ||
|
|
d43e07198d | ||
|
|
da16764c7a | ||
|
|
98ebc8ac59 | ||
|
|
904b221f03 | ||
|
|
5cc35abc3d | ||
|
|
254774f5a6 | ||
|
|
ae9cdee753 | ||
|
|
53ee0b76bb | ||
|
|
dc6b04c375 | ||
|
|
3d4ccd2a13 | ||
|
|
c59652f0ce | ||
|
|
87d2e314db | ||
|
|
3a30c12019 | ||
|
|
c9a82f54d1 | ||
|
|
444cb78be5 | ||
|
|
171c20e3b6 | ||
|
|
c5fb91f1bc | ||
|
|
9a36a283d3 | ||
|
|
7e35d25ea0 | ||
|
|
3704f5e5b0 | ||
|
|
6b76066632 | ||
|
|
2b01132515 | ||
|
|
8e95a1e18d | ||
|
|
aa7b3dc3db | ||
|
|
13a29d13fd | ||
|
|
a6c2cb8417 | ||
|
|
d511a7bb4f | ||
|
|
3526ff2507 | ||
|
|
adcfe7b789 | ||
|
|
ceb44bef14 | ||
|
|
ed473267df | ||
|
|
0608bc5d82 | ||
|
|
3d511f0e66 | ||
|
|
0b8a436af9 | ||
|
|
352efdd13a | ||
|
|
4855af02a3 | ||
|
|
94a5a1f0f1 | ||
|
|
751d127d7c | ||
|
|
fc101b67e5 | ||
|
|
b0239a05fd | ||
|
|
623d580b4c | ||
|
|
974acb39ff | ||
|
|
2379abaa5e | ||
|
|
3caf781d7c | ||
|
|
55bb9f639a | ||
|
|
0dba04bb58 | ||
|
|
e96f5e3c65 | ||
|
|
558724e99f | ||
|
|
067c96a873 | ||
|
|
4b380c0b40 | ||
|
|
2dfb24730d | ||
|
|
725432efaa | ||
|
|
a2216ef19f | ||
|
|
5332cbae18 | ||
|
|
209b026e46 | ||
|
|
1ae607beca | ||
|
|
d393f1923f | ||
|
|
081d5ae971 | ||
|
|
0492f0f3f9 | ||
|
|
147e0a75fd | ||
|
|
ee068af843 | ||
|
|
2dbcddd83d | ||
|
|
d2bda3b56a | ||
|
|
903fd85c85 | ||
|
|
d57c681a6d | ||
|
|
d7efe5857c | ||
|
|
8fd694c18f | ||
|
|
e69b0b1771 | ||
|
|
9dc0bfd617 | ||
|
|
e6664ec2c9 | ||
|
|
dbb33f412f | ||
|
|
70b89a6205 | ||
|
|
07b144855a | ||
|
|
292a0aed66 | ||
|
|
42f0201e21 | ||
|
|
22db876d48 | ||
|
|
bdd6e3a153 | ||
|
|
7b8f580941 | ||
|
|
198adea961 | ||
|
|
86c5a0013f | ||
|
|
ef85c22474 | ||
|
|
d3555d2e50 | ||
|
|
c4b91bfcf1 | ||
|
|
0f5e86a0d9 | ||
|
|
7b294a99fd | ||
|
|
1e4b2e98d9 | ||
|
|
3fd6ccdf76 | ||
|
|
fa9a30b491 | ||
|
|
d90ca75a6c | ||
|
|
e107454454 | ||
|
|
d43962d013 | ||
|
|
2f6d35c3d4 | ||
|
|
86de5f768b | ||
|
|
2663e44724 | ||
|
|
6f2900c164 | ||
|
|
7888b5127c | ||
|
|
8808c291b9 | ||
|
|
8cdf0825de | ||
|
|
9e0dbe8e59 | ||
|
|
52f99d3944 | ||
|
|
186368ddc3 | ||
|
|
c0b94ae1df | ||
|
|
ddd86309a1 | ||
|
|
e9d453b623 | ||
|
|
ecb4babcf4 | ||
|
|
34753eaebb | ||
|
|
efa72a631b | ||
|
|
30d835168a | ||
|
|
8f6a744807 | ||
|
|
6726771645 | ||
|
|
a51cae6b2e | ||
|
|
d30b943251 | ||
|
|
0934568d9c | ||
|
|
697e64bbb6 | ||
|
|
bffb9b0e95 | ||
|
|
6ae7af78a3 | ||
|
|
041a26fd79 | ||
|
|
3c356b1a1f | ||
|
|
b1215f2f8c | ||
|
|
0b73041b16 | ||
|
|
9579bd47e5 | ||
|
|
09d47af2c0 | ||
|
|
ef0238ba2b | ||
|
|
a9f6f7ad39 | ||
|
|
1d254d321b | ||
|
|
41646ed006 | ||
|
|
3679781872 | ||
|
|
38dcf3454b | ||
|
|
e34d57ca90 | ||
|
|
20f492c298 | ||
|
|
c7c82be1c3 | ||
|
|
9564f688c4 | ||
|
|
90c1776c86 | ||
|
|
9cf861e8fa | ||
|
|
9b7b1da133 | ||
|
|
a5ab891292 | ||
|
|
90bb4ac821 | ||
|
|
23a0d1bc1f | ||
|
|
0e96c378fd | ||
|
|
ee16efff3c | ||
|
|
0197519dd7 | ||
|
|
865829cfac | ||
|
|
0571c3187b | ||
|
|
d12a2d0d04 | ||
|
|
2d369bd916 | ||
|
|
93843c55b6 | ||
|
|
e3a6132e12 | ||
|
|
736f0146c3 | ||
|
|
897fc2b6ef | ||
|
|
441c116105 | ||
|
|
8ecd80a34a | ||
|
|
4ba53db0da | ||
|
|
6c365ff648 | ||
|
|
e33bcdbb7b | ||
|
|
ec6b354c32 | ||
|
|
292d1af1a0 | ||
|
|
325b398e3c | ||
|
|
6f5667b4d4 | ||
|
|
cceeee7806 | ||
|
|
0a4546b742 | ||
|
|
b1eed27a54 | ||
|
|
1a3ad4b670 | ||
|
|
86a5f98e4a | ||
|
|
1caa44bea9 | ||
|
|
dbbf92c1d1 | ||
|
|
cb429d6b12 | ||
|
|
b0bded3f2f | ||
|
|
f9aaf22fc3 | ||
|
|
35ff3c731d | ||
|
|
63fa6c832e | ||
|
|
e4e5042e38 | ||
|
|
ae53e3e233 | ||
|
|
074d9bff7f | ||
|
|
f36862603a | ||
|
|
47691c031f | ||
|
|
ce7ddd8921 | ||
|
|
950c047b49 | ||
|
|
46509953a9 | ||
|
|
db348dcff2 | ||
|
|
a33f471065 | ||
|
|
ece3ce581e | ||
|
|
8189a98d85 | ||
|
|
d7a77091a3 | ||
|
|
3e1e74fca6 | ||
|
|
33b5670122 | ||
|
|
95e19e2e23 | ||
|
|
99ac042702 | ||
|
|
774b9f8653 | ||
|
|
eb1d2344f7 | ||
|
|
6fa9860dbe | ||
|
|
0cc36770f1 | ||
|
|
558cd543bf | ||
|
|
bd906e3410 | ||
|
|
35086cb501 | ||
|
|
2056ffc227 | ||
|
|
7745439312 | ||
|
|
c4b5abbe43 | ||
|
|
f87842483e | ||
|
|
3dbb32c734 | ||
|
|
609ea80276 | ||
|
|
3dfecaaf7c | ||
|
|
3165c915b6 | ||
|
|
457ccc42c9 | ||
|
|
00880c720a | ||
|
|
856bc36533 | ||
|
|
fe71887b68 | ||
|
|
10094bd885 | ||
|
|
eea0c0f2ed | ||
|
|
85be43e0df | ||
|
|
0cb9e9fc8d | ||
|
|
cb61d3b46b | ||
|
|
113840da12 | ||
|
|
deb2e66bcc | ||
|
|
9b2d69aa80 | ||
|
|
e3ff4cdd23 | ||
|
|
0745ba43a4 | ||
|
|
3ede843d50 | ||
|
|
2e8d6e8690 | ||
|
|
69a5558203 | ||
|
|
d6905403e3 | ||
|
|
411926b572 | ||
|
|
439b93f6d2 | ||
|
|
d6cf67778c | ||
|
|
b94dab5250 | ||
|
|
6178974cd9 | ||
|
|
0b9e4d1278 | ||
|
|
63fa3c3f8f | ||
|
|
3612d9a57a | ||
|
|
b60de4447a | ||
|
|
16dddb760e | ||
|
|
eff7c9166e | ||
|
|
f1bf2603e6 | ||
|
|
6f32991eae | ||
|
|
202fc9e8ed | ||
|
|
e378b24487 | ||
|
|
3628b22d49 | ||
|
|
af2b0d0205 | ||
|
|
4bf988959a | ||
|
|
a0e4fb3a28 | ||
|
|
2c445be8ba | ||
|
|
e3f4063683 | ||
|
|
6bbe6d5b92 | ||
|
|
89ae305e11 | ||
|
|
da8d7f09f1 | ||
|
|
25c986db5a | ||
|
|
a8f249458d | ||
|
|
bc5b35367f | ||
|
|
930aff2c2e | ||
|
|
ac3e2a3fdd | ||
|
|
9ccb12b031 | ||
|
|
e18a2c22db | ||
|
|
b716c0ef01 | ||
|
|
2efa3b70dc | ||
|
|
49959d4f1c | ||
|
|
0f27a03607 | ||
|
|
c2a8ebfe69 | ||
|
|
43aac5bacc | ||
|
|
bff2b7c94d | ||
|
|
2d45a262d9 | ||
|
|
ed652d8136 | ||
|
|
6fe0f1fab9 | ||
|
|
b0beb0b1ca | ||
|
|
018dec8588 | ||
|
|
5d6209e1f9 | ||
|
|
601b711c78 | ||
|
|
78702753f2 | ||
|
|
7aa1ff8ff6 | ||
|
|
d6c97cf010 | ||
|
|
1b2508362b | ||
|
|
cd898af59f | ||
|
|
0a535e58d8 | ||
|
|
9ce9e295fe | ||
|
|
9a38592c79 | ||
|
|
9b3965b08c | ||
|
|
531cb4f673 | ||
|
|
3559c5d7a2 | ||
|
|
8631e2976a | ||
|
|
2768bc1764 | ||
|
|
6f4698ee1f | ||
|
|
85e5165e98 | ||
|
|
17c16f2a71 | ||
|
|
91c3f86c2b | ||
|
|
75b1f3becc | ||
|
|
07c5e549b2 | ||
|
|
114eb159a4 | ||
|
|
005cce5507 | ||
|
|
b859b6e79d | ||
|
|
b212a2fb9f | ||
|
|
e40416567a | ||
|
|
b37e5fa2f8 | ||
|
|
326469ef4a | ||
|
|
c73d8ee40d | ||
|
|
abef2ea770 | ||
|
|
b26e32c3af | ||
|
|
7822eff936 | ||
|
|
865676682d | ||
|
|
0f7776af0b | ||
|
|
b03dc011be | ||
|
|
00ce35336e | ||
|
|
723776ddf7 | ||
|
|
5a77ec7f1c | ||
|
|
2fb11f873b | ||
|
|
ad63647446 | ||
|
|
87315e8a8d | ||
|
|
9031ebd7d5 | ||
|
|
12b41d5598 | ||
|
|
6cfd6195c5 | ||
|
|
5163a85d40 | ||
|
|
dbf9ad1f3d |
24
.drone.yml
24
.drone.yml
@@ -190,3 +190,27 @@ steps:
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C cpp_thread_test dgemm_tester
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc10
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:20.04
|
||||
environment:
|
||||
CC: gcc-10
|
||||
FC: gfortran-10
|
||||
COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran-10 perl python g++
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
|
||||
|
||||
2
.github/workflows/nightly-Homebrew-build.yml
vendored
2
.github/workflows/nightly-Homebrew-build.yml
vendored
@@ -43,7 +43,7 @@ jobs:
|
||||
- name: Update Homebrew
|
||||
if: github.event_name != 'pull_request'
|
||||
run: brew update || true
|
||||
|
||||
|
||||
- name: Install prerequisites
|
||||
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -89,5 +89,7 @@ build.*
|
||||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
.vscode
|
||||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
.vscode
|
||||
|
||||
15
.travis.yml
15
.travis.yml
@@ -224,12 +224,21 @@ matrix:
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc@10
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
- BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode12
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode10
|
||||
# env:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 13)
|
||||
set(OpenBLAS_PATCH_VERSION 17.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
@@ -14,6 +14,9 @@ include(GNUInstallDirs)
|
||||
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
if(MSVC AND NOT DEFINED NOFORTRAN)
|
||||
set(NOFORTRAN ON)
|
||||
endif()
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
@@ -229,7 +232,7 @@ if (NOT NO_CBLAS)
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC AND NOT NOFORTRAN)
|
||||
if (NOT NOFORTRAN)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
|
||||
@@ -194,3 +194,6 @@ In chronological order:
|
||||
|
||||
* PingTouGe Semiconductor Co., Ltd.
|
||||
* [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
|
||||
|
||||
* River Dillon <oss@outerpassage.net>
|
||||
* [2021-07-10] fix compilation with musl libc
|
||||
|
||||
161
Changelog.txt
161
Changelog.txt
@@ -1,4 +1,165 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.17
|
||||
15-Jul-2021
|
||||
|
||||
common:
|
||||
- reverted the optimization of SGEMV_N/DGEMV_N for small input sizes
|
||||
and consecutive arguments as it led to stack overflows on x86_64
|
||||
with some operating systems (notably OSX and Windows)
|
||||
|
||||
x86_64:
|
||||
- reverted the performance patch for SGEMV_T on AVX512 as it caused
|
||||
wrong results in some applications
|
||||
|
||||
SPARC:
|
||||
- fixed compilation with compilers other than gcc
|
||||
====================================================================
|
||||
Version 0.3.16
|
||||
11-Jul-2021
|
||||
|
||||
common:
|
||||
- drastically reduced the stack size requirements for running the LAPACK
|
||||
testsuite (Reference-LAPACK PR 553)
|
||||
- fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK
|
||||
PR 564)
|
||||
- expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode
|
||||
- improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N
|
||||
and DGEMV_N, for small input sizes and consecutive arguments
|
||||
- improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes
|
||||
by disabling multithreading
|
||||
- fixed installing with BSD versions of the "install" utility
|
||||
|
||||
RISCV:
|
||||
- fixed the implementation of xIMIN
|
||||
- improved the performance of DSDOT
|
||||
- fixed linking of the tests on C910V with current vendor gcc
|
||||
|
||||
POWER:
|
||||
- fixed SBGEMM computation for some odd value inputs
|
||||
- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5
|
||||
|
||||
x86_64:
|
||||
- improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus
|
||||
- worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc
|
||||
versions
|
||||
- fixed compilation with MS Visual Studio versions older than 2017
|
||||
- fixed macro name collision with winnt.h from the latest Win10 SDK
|
||||
- added cpu type autodetection for Intel Ice Lake SP
|
||||
- fixed cpu type autodetection for Intel Tiger Lake
|
||||
- added cpu type autodetection for recent Centaur/Zhaoxin models
|
||||
- fixed compilation with musl libc
|
||||
|
||||
ARM64:
|
||||
- fixed compilation with gcc/gfortran on the Apple M1
|
||||
- fixed linking of the tests on FreeBSD
|
||||
- fixed missing restore of a register in the recently rewritten DNRM2 kernel
|
||||
for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g.
|
||||
DGEEV
|
||||
- added compiler optimization flags for the EMAG8180
|
||||
- added initial support for Cortex A55
|
||||
|
||||
ARM:
|
||||
- fixed linking of the tests on FreeBSD
|
||||
|
||||
====================================================================
|
||||
Version 0.3.15
|
||||
2-May-2021
|
||||
|
||||
common:
|
||||
- imported improvements and bugfixes from Reference-LAPACK 3.9.1
|
||||
- imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537
|
||||
- fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation
|
||||
- fixed a sequence problem in the generation of softlinks to the library in GMAKE
|
||||
|
||||
RISCV:
|
||||
- fixed compilation on RISCV (missing entry in getarch)
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
POWER:
|
||||
- fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler
|
||||
- improved CGEMM, DGEMM and ZGEMM performance on POWER10
|
||||
- added an optimized ZGEMV kernel for POWER10
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
x86_64:
|
||||
- added support for Intel Control-flow Enforcement Technology (CET)
|
||||
- reverted the DOMATCOPY_RT code to the generic C version
|
||||
- fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14
|
||||
- fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH
|
||||
- added support for compilation of the benchmarks on older OSX versions
|
||||
- fixed propagation of the NO_AVX512 option in CMAKE builds
|
||||
- fixed compilation of the AVX512 SGEMM kernel with clang-cl on Windows
|
||||
- fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX)
|
||||
- corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512
|
||||
|
||||
ARM:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
|
||||
|
||||
ARM64:
|
||||
- fixed spurious reads outside the array in the SGEMM tcopy macro
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14)
|
||||
|
||||
MIPS:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
|
||||
|
||||
MIPS64:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
SPARC:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
====================================================================
|
||||
Version 0.3.14
|
||||
17-Mar-2021
|
||||
|
||||
common:
|
||||
* Fixed a race condition on thread shutdown in non-OpenMP builds
|
||||
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
|
||||
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
|
||||
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
|
||||
* Improved performance of OMATCOPY_RT across all platforms
|
||||
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
|
||||
* Fixed potential misreading of the GCC compiler version in the build scripts
|
||||
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
|
||||
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
|
||||
|
||||
RISCV:
|
||||
* Fixed compilation on RISCV (missing entry in getarch)
|
||||
|
||||
POWER:
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
|
||||
* Added support for compilation on FreeBSD/ppc64le
|
||||
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
|
||||
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
|
||||
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
|
||||
* Improved SCOPY and CCOPY performance on POWER10
|
||||
* Improved SGEMM and DGEMM performance on POWER10
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
|
||||
x86_64:
|
||||
* Added an optimized bfloat16 GEMM kernel for Cooperlake
|
||||
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
|
||||
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
|
||||
* Added support for compilation with the NAG Fortran compiler
|
||||
* Fixed recognition of the AMD AOCC compiler
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
|
||||
* Added support for running the BLAS/CBLAS tests on Windows
|
||||
* Fixed signatures of the tls callback functions for Windows x64
|
||||
* Fixed various issues with fma intrinsics support handling
|
||||
|
||||
ARM:
|
||||
* Added support for embedded Cortex M targets via a new option EMBEDDED
|
||||
|
||||
ARMV8:
|
||||
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
|
||||
* Added support for the DYNAMIC_LIST option
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
* Added support for compiling with the NAG Fortran compiler
|
||||
|
||||
====================================================================
|
||||
Version 0.3.13
|
||||
12-Dec-2020
|
||||
|
||||
8
Makefile
8
Makefile
@@ -59,6 +59,9 @@ endif
|
||||
@$(CC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${cverinfo}" ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
|
||||
else \
|
||||
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
|
||||
@@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
@$(FC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${fverinfo}" ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
|
||||
else \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
|
||||
@@ -161,7 +167,6 @@ ifeq ($(NO_SHARED), 1)
|
||||
$(error OpenBLAS: neither static nor shared are enabled.)
|
||||
endif
|
||||
endif
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
@@ -190,6 +195,7 @@ endif
|
||||
ifdef USE_THREAD
|
||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||
endif
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@touch lib.grd
|
||||
|
||||
prof : prof_blas prof_lapack
|
||||
|
||||
@@ -1,28 +1,49 @@
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
|
||||
else
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV8)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA72)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA73)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
@@ -30,51 +51,105 @@ ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||
ifeq ($(CORE), CORTEXA55)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FALKOR)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VORTEX)
|
||||
CCOMMON_OPT += -march=armv8.3-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), EMAG8180)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
endif
|
||||
@@ -74,17 +74,17 @@ endif
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
@@ -92,7 +92,7 @@ endif
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
|
||||
3
Makefile.loongarch64
Normal file
3
Makefile.loongarch64
Normal file
@@ -0,0 +1,3 @@
|
||||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
@@ -10,9 +10,11 @@ USE_OPENMP = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.13
|
||||
VERSION = 0.3.17.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
||||
@@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
else ifeq ($(ARCH), powerpc64)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc64le)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), i386)
|
||||
@@ -181,7 +183,7 @@ endif
|
||||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
@@ -331,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
@@ -378,6 +381,12 @@ ifeq ($(OSNAME), AIX)
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
NEED_PIC = 0
|
||||
NO_EXPRECISION = 1
|
||||
@@ -617,12 +626,18 @@ DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += NEOVERSEN1
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
DYNAMIC_CORE += THUNDERX3T110
|
||||
ifdef DYNAMIC_LIST
|
||||
override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
|
||||
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
|
||||
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
@@ -663,6 +678,7 @@ endif
|
||||
endif # ARCH zarch
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
@@ -689,6 +705,10 @@ else
|
||||
# No comma after the function name: "$(info, ...)" is parsed as a variable reference and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
DYNAMIC_CORE = POWER8
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
@@ -760,6 +780,11 @@ NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
@@ -830,6 +855,13 @@ ifeq ($(OSNAME), AIX)
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
ifeq ($(CORE), LOONGSON3R5) # must match the LOONGSON3R5 core name used in the supported-target list
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef BINARY_DEFINED
|
||||
@@ -847,9 +879,19 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
|
||||
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
|
||||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
|
||||
# Three-flag key: major > 20, major >= 20, characters 4-5 of the version == 11.
# It is matched against "110 111 011" to decide whether NEWPGI is set.
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
|
||||
NEWPGI := 1
|
||||
endif
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
ifneq ($(NEWPGI),1)
|
||||
CCOMMON_OPT += -D__MMX__ -Mnollvm
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER8)
|
||||
@@ -877,13 +919,25 @@ endif
|
||||
# Fortran Compiler dependent settings
|
||||
#
|
||||
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
|
||||
ifeq ($(FLANG_VENDOR),AOCC)
|
||||
FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
|
||||
ifeq ($(FLANG_VENDOR), AMD)
|
||||
FCOMMON_OPT += -fno-unroll-loops
|
||||
endif
|
||||
endif
|
||||
@@ -1029,18 +1083,24 @@ ifeq ($(ARCH), x86_64)
|
||||
FCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER6)
|
||||
$(warning NVIDIA HPC compilers do not support POWER6.)
|
||||
endif
|
||||
ifeq ($(CORE), POWER8)
|
||||
FCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
ifeq ($(CORE), POWER9)
|
||||
FCOMMON_OPT += -tp pwr9
|
||||
endif
|
||||
ifeq ($(CORE), POWER10)
|
||||
$(warning NVIDIA HPC compilers do not support POWER10.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -tp p7
|
||||
endif
|
||||
FCOMMON_OPT += -Mrecursive
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -mp
|
||||
endif
|
||||
@@ -1179,6 +1239,8 @@ CCOMMON_OPT += -fPIC
|
||||
endif
|
||||
ifeq ($(F_COMPILER), SUN)
|
||||
FCOMMON_OPT += -pic
|
||||
else ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -PIC
|
||||
else
|
||||
FCOMMON_OPT += -fPIC
|
||||
endif
|
||||
@@ -1256,6 +1318,10 @@ CCOMMON_OPT += -DUSE_PAPI
|
||||
EXTRALIB += -lpapi -lperfctr
|
||||
endif
|
||||
|
||||
ifdef BUFFERSIZE
|
||||
CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
|
||||
endif
|
||||
|
||||
ifdef DYNAMIC_THREADS
|
||||
CCOMMON_OPT += -DDYNAMIC_THREADS
|
||||
endif
|
||||
@@ -1433,6 +1499,10 @@ LAPACK_FFLAGS := $(FFLAGS)
|
||||
LAPACK_FPFLAGS := $(FPFLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER),NAG)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||
ifdef INTERFACE64
|
||||
|
||||
17
Makefile.x86
17
Makefile.x86
@@ -1,10 +1,21 @@
|
||||
# COMPILER_PREFIX = mingw32-
|
||||
|
||||
ifdef HAVE_SSE
|
||||
CCOMMON_OPT += -msse
|
||||
FCOMMON_OPT += -msse
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
else
|
||||
ifdef TARGET_CORE
|
||||
ADD_CPUFLAGS = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef ADD_CPUFLAGS
|
||||
ifdef HAVE_SSE
|
||||
CCOMMON_OPT += -msse
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
ARFLAGS = -m x86
|
||||
|
||||
@@ -8,42 +8,57 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
else
|
||||
ifdef TARGET_CORE
|
||||
ADD_CPUFLAGS = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef ADD_CPUFLAGS
|
||||
ifdef HAVE_SSE3
|
||||
CCOMMON_OPT += -msse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CCOMMON_OPT += -mssse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mssse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSE4_1
|
||||
CCOMMON_OPT += -msse4.1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse4.1
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_AVX
|
||||
CCOMMON_OPT += -mavx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifndef NO_AVX2
|
||||
ifdef HAVE_AVX2
|
||||
CCOMMON_OPT += -mavx2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
CCOMMON_OPT += -mfma
|
||||
FCOMMON_OPT += -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
@@ -56,18 +71,18 @@ endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), COOPERLAKE)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
@@ -80,7 +95,6 @@ endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef HAVE_AVX2
|
||||
ifndef NO_AVX2
|
||||
@@ -112,6 +126,7 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
|
||||
11
README.md
11
README.md
@@ -13,17 +13,21 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
|
||||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
|
||||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
|
||||
|
||||
## Binary Packages
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
|
||||
|
||||
## Installation from Source
|
||||
|
||||
@@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||
|
||||
## Usage
|
||||
|
||||
|
||||
@@ -92,6 +92,7 @@ CORTEXA57
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
NEOVERSEN1
|
||||
CORTEXA55
|
||||
EMAG8180
|
||||
FALKOR
|
||||
THUNDERX
|
||||
@@ -109,3 +110,5 @@ Z14
|
||||
RISCV64_GENERIC
|
||||
C910V
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSON3R5
|
||||
|
||||
21
appveyor.yml
21
appveyor.yml
@@ -30,10 +30,10 @@ environment:
|
||||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
||||
matrix:
|
||||
- COMPILER: clang-cl
|
||||
WITH_FORTRAN: yes
|
||||
WITH_FORTRAN: ON
|
||||
- COMPILER: clang-cl
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
WITH_FORTRAN: OFF
|
||||
- COMPILER: cl
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
@@ -47,12 +47,8 @@ environment:
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake
|
||||
|
||||
- if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang
|
||||
|
||||
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
||||
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
|
||||
@@ -68,15 +64,14 @@ before_build:
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
|
||||
test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
- ctest -j2
|
||||
|
||||
|
||||
@@ -4,7 +4,15 @@ trigger:
|
||||
branches:
|
||||
include:
|
||||
- develop
|
||||
|
||||
resources:
|
||||
containers:
|
||||
- container: oneapi-hpckit
|
||||
image: intel/oneapi-hpckit:latest
|
||||
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
|
||||
- container: oneapi-basekit
|
||||
image: intel/oneapi-basekit:latest
|
||||
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
|
||||
|
||||
jobs:
|
||||
# manylinux1 is useful to test because the
|
||||
# standard Docker container uses an old version
|
||||
@@ -68,4 +76,92 @@ jobs:
|
||||
dir
|
||||
openblas_utest.exe
|
||||
|
||||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_GCC_Nothreads
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_OpenMP_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
sudo mkdir -p /opt/intel
|
||||
sudo chown $USER /opt/intel
|
||||
displayName: prepare for cache restore
|
||||
- task: Cache@2
|
||||
inputs:
|
||||
path: /opt/intel/oneapi
|
||||
key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"'
|
||||
cacheHitVar: CACHE_RESTORED
|
||||
- script: |
|
||||
curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5
|
||||
hdiutil attach webimage.dmg
|
||||
sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=.
|
||||
installer_exit_code=$?
|
||||
hdiutil detach /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)" -quiet  # same volume name as the bootstrapper path; $URL is never set in this script
|
||||
exit $installer_exit_code
|
||||
displayName: install
|
||||
condition: ne(variables.CACHE_RESTORED, 'true')
|
||||
- script: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
make CC=/usr/local/opt/llvm/bin/clang FC=ifort
|
||||
|
||||
- job: OSX_NDK_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install --cask android-ndk
|
||||
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
|
||||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install
|
||||
alpine ls -l mytestdir/include
|
||||
alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c
|
||||
alpine echo "#include <openblas_config.h>" >>test_install.c
|
||||
alpine echo "int main(){" >> test_install.c
|
||||
alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c
|
||||
alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install
|
||||
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
#include <time.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
@@ -74,6 +76,9 @@ static void *huge_malloc(BLASLONG size){
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
struct timeval start, stop;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info_data_t info;
|
||||
uint64_t start = 0, stop = 0;
|
||||
#else
|
||||
struct timespec start = { 0, 0 }, stop = { 0, 0 };
|
||||
#endif
|
||||
@@ -82,6 +87,9 @@ double getsec()
|
||||
{
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info(&info);
|
||||
return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9;
|
||||
#else
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
|
||||
#endif
|
||||
@@ -90,6 +98,8 @@ double getsec()
|
||||
void begin() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
#endif
|
||||
@@ -98,7 +108,9 @@ void begin() {
|
||||
void end() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,13 +72,17 @@ int main(int argc, char *argv[]){
|
||||
FLOAT *a,*work;
|
||||
FLOAT wkopt[4];
|
||||
blasint *ipiv;
|
||||
blasint m, i, j, info,lwork;
|
||||
blasint m, i, j, l, info,lwork;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
double time1,timeg;
|
||||
|
||||
char *p;
|
||||
char btest = 'I';
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -86,6 +90,9 @@ int main(int argc, char *argv[]){
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); /* parse the decimal repeat count; *p would store the character code of its first digit */
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||
|
||||
@@ -124,32 +131,41 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " SIZE FLops Time Lwork\n");
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
timeg = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
GETRF (&m, &m, a, &m, ipiv, &info);
|
||||
for (l = 0; l < loops; l++) {
|
||||
|
||||
if (btest == 'F') begin();
|
||||
GETRF (&m, &m, a, &m, ipiv, &info);
|
||||
if (btest == 'F') {
|
||||
end();
|
||||
timeg += getsec();
|
||||
}
|
||||
if (info) {
|
||||
fprintf(stderr, "Matrix is singular .. %d\n", info); /* reached only when GETRF reported a non-zero info */
|
||||
exit(1);
|
||||
}
|
||||
|
||||
begin();
|
||||
if (btest == 'I') begin();
|
||||
|
||||
lwork = -1;
|
||||
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
|
||||
|
||||
lwork = (blasint)wkopt[0];
|
||||
GETRI(&m, a, &m, ipiv, work, &lwork, &info);
|
||||
end();
|
||||
if (btest == 'I') end();
|
||||
|
||||
if (info) {
|
||||
fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
if (btest == 'I')
|
||||
timeg += getsec();
|
||||
|
||||
} // loops
|
||||
time1 = timeg/(double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops : %10.2f Sec : %d\n",
|
||||
COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork);
|
||||
|
||||
@@ -72,17 +72,21 @@ int main(int argc, char *argv[]){
|
||||
FLOAT *a, *b;
|
||||
blasint *ipiv;
|
||||
|
||||
blasint m, i, j, info;
|
||||
blasint m, i, j, l, info;
|
||||
blasint unit = 1;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
FLOAT maxerr;
|
||||
|
||||
double time1, time2;
|
||||
double time1, time2, timeg1,timeg2;
|
||||
|
||||
char *p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); /* parse the decimal repeat count; *p would store the character code of its first digit */
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
@@ -110,9 +114,9 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " SIZE Residual Decompose Solve Total\n");
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
timeg1 = timeg2 = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for (l = 0; l < loops; l++) {
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
@@ -138,7 +142,7 @@ int main(int argc, char *argv[]){
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
timeg1 += getsec();
|
||||
|
||||
begin();
|
||||
|
||||
@@ -151,8 +155,10 @@ int main(int argc, char *argv[]){
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time2 = getsec();
|
||||
|
||||
timeg2 += getsec();
|
||||
} //loops
|
||||
time1=timeg1/(double)loops;
|
||||
time2=timeg2/(double)loops;
|
||||
maxerr = 0.;
|
||||
|
||||
for(i = 0; i < m; i++){
|
||||
|
||||
@@ -99,14 +99,15 @@ int main(int argc, char *argv[]){
|
||||
char *p;
|
||||
char btest = 'F';
|
||||
|
||||
blasint m, i, j, info, uplos=0;
|
||||
double flops;
|
||||
blasint m, i, j, l, info, uplos=0;
|
||||
double flops = 0.;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
double time1, timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -119,6 +120,8 @@ int main(int argc, char *argv[]){
|
||||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); /* parse the decimal repeat count; *p would store the character code of its first digit */
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
@@ -129,19 +132,21 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
timeg=0.;
|
||||
for (l = 0; l < loops; l++) {
|
||||
#ifndef COMPLEX
|
||||
if (uplos & 1) {
|
||||
for (j = 0; j < m; j++) {
|
||||
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < m; j++) {
|
||||
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
|
||||
}
|
||||
}
|
||||
@@ -192,8 +197,8 @@ int main(int argc, char *argv[]){
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
if ( btest == 'F')
|
||||
timeg += getsec();
|
||||
|
||||
if ( btest == 'S' )
|
||||
{
|
||||
@@ -214,9 +219,7 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, "Potrs info = %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
|
||||
|
||||
timeg += getsec();
|
||||
}
|
||||
|
||||
if ( btest == 'I' )
|
||||
@@ -232,11 +235,17 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, "Potri info = %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
timeg += getsec();
|
||||
}
|
||||
|
||||
} // loops
|
||||
|
||||
time1 = timeg/(double)loops;
|
||||
if ( btest == 'F')
|
||||
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
if ( btest == 'S')
|
||||
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
|
||||
if ( btest == 'I')
|
||||
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest);
|
||||
|
||||
|
||||
|
||||
@@ -46,14 +46,17 @@ int main(int argc, char *argv[]){
|
||||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint m, i, j, l;
|
||||
blasint inc_x= 1;
|
||||
blasint inc_y= 1;
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); /* parse the decimal repeat count; *p would store the character code of its first digit */
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -85,8 +88,9 @@ int main(int argc, char *argv[]){
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
for (l = 0; l < loops; l++) {
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
@@ -107,8 +111,10 @@ int main(int argc, char *argv[]){
|
||||
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
timeg += getsec();
|
||||
} // loops
|
||||
|
||||
time1 = timeg/(double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
||||
@@ -56,17 +56,20 @@ int main(int argc, char *argv[]){
|
||||
|
||||
char uplo='U';
|
||||
char trans='N';
|
||||
|
||||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint m, i, j, l;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); /* parse the decimal repeat count; *p would store the character code of its first digit */
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -95,9 +98,12 @@ int main(int argc, char *argv[]){
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
timeg = 0.;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for(l = 0; l < loops; l++) {
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
@@ -111,8 +117,10 @@ int main(int argc, char *argv[]){
|
||||
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
timeg += getsec();
|
||||
|
||||
} //loops
|
||||
time1 = timeg / (double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
||||
57
c_check
57
c_check
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
|
||||
chop($hostarch);
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
@@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
@@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "loongarch64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
@@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
5
cblas.h
5
cblas.h
@@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx,
|
||||
|
||||
void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
|
||||
void cblas_srotg(float *a, float *b, float *c, float *s);
|
||||
void cblas_drotg(double *a, double *b, double *c, double *s);
|
||||
void cblas_crotg(void *a, void *b, float *c, void *s);
|
||||
void cblas_zrotg(void *a, void *b, double *c, void *s);
|
||||
|
||||
|
||||
void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
|
||||
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);
|
||||
|
||||
@@ -44,7 +44,10 @@ endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
|
||||
@@ -124,9 +124,9 @@ if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
|
||||
endif ()
|
||||
if (HAVE_FMA3)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
endif ()
|
||||
# if (HAVE_FMA3)
|
||||
#set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
#endif ()
|
||||
if (HAVE_SSE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
|
||||
endif ()
|
||||
|
||||
@@ -66,7 +66,7 @@ set(SLASRC
|
||||
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
|
||||
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
|
||||
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
|
||||
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
slarrv.f slartv.f
|
||||
slarz.f slarzb.f slarzt.f slasy2.f
|
||||
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
|
||||
@@ -112,14 +112,14 @@ set(SLASRC
|
||||
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
|
||||
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f
|
||||
sgelqt.f sgelqt3.f sgemlqt.f
|
||||
sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
|
||||
sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
|
||||
sgelq.f slaswlq.f slamswlq.f sgemlq.f
|
||||
stplqt.f stplqt2.f stpmlqt.f
|
||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
@@ -171,7 +171,7 @@ set(CLASRC
|
||||
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
|
||||
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
|
||||
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
|
||||
clarf.f clarfb.f clarfg.f clarfgp.f clarft.f
|
||||
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
|
||||
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
|
||||
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
|
||||
@@ -209,14 +209,14 @@ set(CLASRC
|
||||
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
|
||||
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f
|
||||
cgelqt.f cgelqt3.f cgemlqt.f
|
||||
cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
|
||||
cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
|
||||
cgelq.f claswlq.f clamswlq.f cgemlq.f
|
||||
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cunhr_col.f )
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f )
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
@@ -253,7 +253,7 @@ set(DLASRC
|
||||
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
|
||||
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
|
||||
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
|
||||
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
dlargv.f dlarrv.f dlartv.f
|
||||
dlarz.f dlarzb.f dlarzt.f dlasy2.f
|
||||
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
|
||||
@@ -300,14 +300,14 @@ set(DLASRC
|
||||
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
|
||||
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f
|
||||
dgelqt.f dgelqt3.f dgemlqt.f
|
||||
dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
|
||||
dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
|
||||
dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
|
||||
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
@@ -360,7 +360,7 @@ set(ZLASRC
|
||||
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
|
||||
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
|
||||
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
|
||||
zlarcm.f zlarf.f zlarfb.f
|
||||
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
|
||||
zlarfg.f zlarfgp.f zlarft.f
|
||||
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
@@ -402,13 +402,13 @@ set(ZLASRC
|
||||
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f
|
||||
ztplqt.f ztplqt2.f ztpmlqt.f
|
||||
zgelqt.f zgelqt3.f zgemlqt.f
|
||||
zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
|
||||
zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
|
||||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zunhr_col.f)
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
||||
@@ -114,6 +114,8 @@ set(CSRC
|
||||
lapacke_cgetrs_work.c
|
||||
lapacke_cgetsls.c
|
||||
lapacke_cgetsls_work.c
|
||||
lapacke_cgetsqrhrt.c
|
||||
lapacke_cgetsqrhrt_work.c
|
||||
lapacke_cggbak.c
|
||||
lapacke_cggbak_work.c
|
||||
lapacke_cggbal.c
|
||||
@@ -590,6 +592,8 @@ set(CSRC
|
||||
lapacke_cungrq_work.c
|
||||
lapacke_cungtr.c
|
||||
lapacke_cungtr_work.c
|
||||
lapacke_cungtsqr_row.c
|
||||
lapacke_cungtsqr_row_work.c
|
||||
lapacke_cunmbr.c
|
||||
lapacke_cunmbr_work.c
|
||||
lapacke_cunmhr.c
|
||||
@@ -735,6 +739,8 @@ set(DSRC
|
||||
lapacke_dgetrs_work.c
|
||||
lapacke_dgetsls.c
|
||||
lapacke_dgetsls_work.c
|
||||
lapacke_dgetsqrhrt.c
|
||||
lapacke_dgetsqrhrt_work.c
|
||||
lapacke_dggbak.c
|
||||
lapacke_dggbak_work.c
|
||||
lapacke_dggbal.c
|
||||
@@ -862,6 +868,8 @@ set(DSRC
|
||||
lapacke_dorgrq_work.c
|
||||
lapacke_dorgtr.c
|
||||
lapacke_dorgtr_work.c
|
||||
lapacke_dorgtsqr_row.c
|
||||
lapacke_dorgtsqr_row_work.c
|
||||
lapacke_dormbr.c
|
||||
lapacke_dormbr_work.c
|
||||
lapacke_dormhr.c
|
||||
@@ -1309,6 +1317,8 @@ set(SSRC
|
||||
lapacke_sgetrs_work.c
|
||||
lapacke_sgetsls.c
|
||||
lapacke_sgetsls_work.c
|
||||
lapacke_sgetsqrhrt.c
|
||||
lapacke_sgetsqrhrt_work.c
|
||||
lapacke_sggbak.c
|
||||
lapacke_sggbak_work.c
|
||||
lapacke_sggbal.c
|
||||
@@ -1435,6 +1445,8 @@ set(SSRC
|
||||
lapacke_sorgrq_work.c
|
||||
lapacke_sorgtr.c
|
||||
lapacke_sorgtr_work.c
|
||||
lapacke_sorgtsqr_row.c
|
||||
lapacke_sorgtsqr_row_work.c
|
||||
lapacke_sormbr.c
|
||||
lapacke_sormbr_work.c
|
||||
lapacke_sormhr.c
|
||||
@@ -1877,6 +1889,8 @@ set(ZSRC
|
||||
lapacke_zgetrs_work.c
|
||||
lapacke_zgetsls.c
|
||||
lapacke_zgetsls_work.c
|
||||
lapacke_zgetsqrhrt.c
|
||||
lapacke_zgetsqrhrt_work.c
|
||||
lapacke_zggbak.c
|
||||
lapacke_zggbak_work.c
|
||||
lapacke_zggbal.c
|
||||
@@ -2351,6 +2365,8 @@ set(ZSRC
|
||||
lapacke_zungrq_work.c
|
||||
lapacke_zungtr.c
|
||||
lapacke_zungtr_work.c
|
||||
lapacke_zungtsqr_row.c
|
||||
lapacke_zungtsqr_row_work.c
|
||||
lapacke_zunmbr.c
|
||||
lapacke_zunmbr_work.c
|
||||
lapacke_zunmhr.c
|
||||
@@ -2499,6 +2515,5 @@ foreach (Utils_FILE ${Utils_SRC})
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
||||
@@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
||||
@@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
endif ()
|
||||
@@ -148,16 +148,20 @@ endif ()
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
@@ -182,11 +186,11 @@ if (DEFINED TARGET)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if (NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
endif()
|
||||
endif()
|
||||
# if (DEFINED HAVE_FMA3)
|
||||
# if (NOT NO_AVX2)
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
# endif()
|
||||
# endif()
|
||||
if (DEFINED HAVE_SSE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
|
||||
endif()
|
||||
@@ -233,6 +237,11 @@ if (BINARY64)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if(EMBEDDED)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16")
|
||||
endif()
|
||||
|
||||
if (NEED_PIC)
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
|
||||
@@ -290,6 +299,10 @@ if (NO_AVX2)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
|
||||
endif ()
|
||||
|
||||
if (NO_AVX512)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif ()
|
||||
|
||||
if (USE_THREAD)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
# NO_AFFINITY = 1
|
||||
|
||||
@@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
|
||||
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
|
||||
endif ()
|
||||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
@@ -251,6 +254,19 @@ function(GenerateNamedObjects sources_in)
|
||||
# now add the object and set the defines
|
||||
set(obj_defines ${defines_in})
|
||||
|
||||
list(FIND obj_defines "RC" def_idx)
|
||||
if (${def_idx} GREATER -1)
|
||||
# list(REMOVE_AT ${obj_defines} ${def_idx})
|
||||
list (REMOVE_ITEM obj_defines "RC")
|
||||
list(APPEND obj_defines "RC=RC")
|
||||
endif ()
|
||||
list(FIND obj_defines "CR" def_idx)
|
||||
if (${def_idx} GREATER -1)
|
||||
# list(REMOVE_AT ${obj_defines} ${def_idx})
|
||||
list (REMOVE_ITEM obj_defines "CR")
|
||||
list(APPEND obj_defines "CR=CR")
|
||||
endif ()
|
||||
|
||||
if (use_cblas)
|
||||
set(obj_name "cblas_${obj_name}")
|
||||
list(APPEND obj_defines "CBLAS")
|
||||
@@ -295,7 +311,15 @@ function(GenerateNamedObjects sources_in)
|
||||
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
|
||||
file(REMOVE ${new_source_file}.tmp)
|
||||
list(APPEND SRC_LIST_OUT ${new_source_file})
|
||||
|
||||
message (STATUS ${new_source_file})
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
if ( ${new_source_file} MATCHES "dgemv_t_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
endforeach ()
|
||||
|
||||
|
||||
28
common.h
28
common.h
@@ -122,7 +122,7 @@ extern "C" {
|
||||
#define ATOM GOTO_ATOM
|
||||
#undef GOTO_ATOM
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
@@ -134,6 +134,9 @@ extern "C" {
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#else
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_SUNOS)
|
||||
@@ -413,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_alpha.h"
|
||||
#endif
|
||||
|
||||
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
|
||||
#if __has_include(<cet.h>)
|
||||
#include <cet.h>
|
||||
#endif
|
||||
#endif
|
||||
#ifndef _CET_ENDBR
|
||||
#define _CET_ENDBR
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#include "common_x86.h"
|
||||
#endif
|
||||
@@ -437,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_mips.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef ARCH_RISCV64
|
||||
#include "common_riscv64.h"
|
||||
#endif
|
||||
@@ -458,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_LOONGARCH64
|
||||
#include "common_loongarch64.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
@@ -488,10 +504,12 @@ static inline unsigned long long rpcc(void){
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv,NULL);
|
||||
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#define RPCC_DEFINED
|
||||
@@ -521,6 +539,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
#include "common_linux.h"
|
||||
#endif
|
||||
|
||||
#ifdef OS_EMBEDDED
|
||||
#define DTB_DEFAULT_ENTRIES 64
|
||||
#endif
|
||||
|
||||
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
|
||||
|
||||
#ifdef __NetBSD__
|
||||
|
||||
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
@@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
199
common_loongarch64.h
Normal file
199
common_loongarch64.h
Normal file
@@ -0,0 +1,199 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#ifndef COMMON_LOONGARCH64
|
||||
#define COMMON_LOONGARCH64
|
||||
|
||||
/* Memory barrier macros. MB, WMB and RMB all expand to the same compiler
   builtin, __sync_synchronize(); this header makes no distinction between
   the three. */
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/* Return the quotient x / y using ordinary integer division.
   The operands are blasint but the result is returned as int, so the
   quotient is converted to int on return. y is not checked for zero. */
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
/* GET_IMAGE(res): copy floating-point register $f2 into res with a single
   fmov instruction (fmov.d when DOUBLE is defined, fmov.s otherwise).
   The asm statement is volatile and declares a "memory" clobber.
   NOTE(review): presumably $f2 carries the imaginary part of a complex
   result returned by a kernel - confirm against the calling convention. */
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
#define GET_IMAGE_CANCEL
|
||||
|
||||
#else
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define LD fld.d
|
||||
#define ST fst.d
|
||||
#define MADD fmadd.d
|
||||
#define NMADD fnmadd.d
|
||||
#define MSUB fmsub.d
|
||||
#define NMSUB fnmsub.d
|
||||
#define ADD fadd.d
|
||||
#define SUB fsub.d
|
||||
#define MUL fmul.d
|
||||
#define MOV fmov.d
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.d
|
||||
#define FABS fabs.d
|
||||
#define CMPEQ fcmp.ceq.d
|
||||
#define CMPLE fcmp.cle.d
|
||||
#define CMPLT fcmp.clt.d
|
||||
#define NEG fneg.d
|
||||
#else
|
||||
#define LD fld.s
|
||||
#define ST fst.s
|
||||
#define MADD fmadd.s
|
||||
#define NMADD fnmadd.s
|
||||
#define MSUB fmsub.s
|
||||
#define NMSUB fnmsub.s
|
||||
#define ADD fadd.s
|
||||
#define SUB fsub.s
|
||||
#define MUL fmul.s
|
||||
#define MOV fmov.s
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.w
|
||||
#define FABS fabs.s
|
||||
#define CMPEQ fcmp.ceq.s
|
||||
#define CMPLE fcmp.cle.s
|
||||
#define CMPLT fcmp.clt.s
|
||||
#define NEG fneg.s
|
||||
#endif /* defined(DOUBLE) */
|
||||
|
||||
/* Integer load/store mnemonics selected by build configuration:
     - LDINT is ld.d only when both __64BIT__ and USE64BITINT are defined,
       otherwise ld.w.
     - LDARG / SDARG are ld.d / st.d whenever __64BIT__ is defined,
       otherwise ld.w / st.w.
   NOTE(review): presumably LDINT loads blasint-sized values and
   LDARG/SDARG move register-sized arguments - confirm against the kernels. */
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
#define LDINT ld.d
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#elif defined(__64BIT__) && !defined(USE64BITINT)
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#else
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.w
|
||||
#define SDARG st.w
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif /* defined(F_INTERFACE) */
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 5 ;\
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function ;\
|
||||
REALNAME: ;\
|
||||
|
||||
/* GNUSTACK: on Linux/ELF builds, expands to a directive that emits an empty
   .note.GNU-stack section; on every other target it expands to nothing.
   EPILOGUE places it after each assembly routine.
   NOTE(review): the empty note is the usual GNU toolchain marker for
   "no executable stack required" - confirm this is the intent here. */
#if defined(__linux__) && defined(__ELF__)
|
||||
#define GNUSTACK .section .note.GNU-stack,"",@progbits
|
||||
#else
|
||||
#define GNUSTACK
|
||||
#endif /* defined(__linux__) && defined(__ELF__) */
|
||||
|
||||
#define EPILOGUE \
|
||||
.end REALNAME ;\
|
||||
GNUSTACK
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
/* MOVT(dst, src, cc): conditional register move for assembly sources.
   bceqz branches to the local label 1 when cc is zero, skipping the move;
   otherwise add.d writes src + $r0 into dst.
   NOTE(review): relies on $r0 reading as zero so that the add acts as a
   plain move - confirm against the LoongArch register conventions. */
#define MOVT(dst, src, cc) \
|
||||
bceqz cc, 1f; \
|
||||
add.d dst, src, $r0; \
|
||||
1:
|
||||
|
||||
#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
|
||||
|
||||
#endif /* defined(ASSEMBLER) */
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
/* Page size in bytes (16 KiB). Kept equal to FIXED_PAGESIZE; a shift count
   of 1 would yield only 32 bytes, which is not a usable page size. */
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -2490,7 +2490,8 @@
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|
||||
|| defined(ARCH_LOONGARCH64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sbgemm_p;
|
||||
|
||||
@@ -340,7 +340,8 @@ REALNAME:
|
||||
.align 16; \
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function; \
|
||||
REALNAME:
|
||||
REALNAME: \
|
||||
_CET_ENDBR
|
||||
|
||||
#ifdef PROFILE
|
||||
#define PROFCODE call mcount
|
||||
|
||||
@@ -451,7 +451,8 @@ REALNAME:
|
||||
.align 512; \
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function; \
|
||||
REALNAME:
|
||||
REALNAME: \
|
||||
_CET_ENDBR
|
||||
|
||||
#ifdef PROFILE
|
||||
#define PROFCODE call *mcount@GOTPCREL(%rip)
|
||||
|
||||
1
cpuid.h
1
cpuid.h
@@ -54,6 +54,7 @@
|
||||
#define VENDOR_TRANSMETA 9
|
||||
#define VENDOR_NSC 10
|
||||
#define VENDOR_HYGON 11
|
||||
#define VENDOR_ZHAOXIN 12
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
||||
@@ -36,6 +36,7 @@ size_t length=sizeof(value);
|
||||
#define CPU_ARMV8 1
|
||||
// Arm
|
||||
#define CPU_CORTEXA53 2
|
||||
#define CPU_CORTEXA55 14
|
||||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
@@ -67,7 +68,8 @@ static char *cpuname[] = {
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1",
|
||||
"THUNDERX3T110",
|
||||
"VORTEX"
|
||||
"VORTEX",
|
||||
"CORTEXA55"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
@@ -84,7 +86,8 @@ static char *cpuname_lower[] = {
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"vortex"
|
||||
"vortex",
|
||||
"cortexa55"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
@@ -161,6 +164,8 @@ int detect(void)
|
||||
return CPU_CORTEXA73;
|
||||
else if (strstr(cpu_part, "0xd0c"))
|
||||
return CPU_NEOVERSEN1;
|
||||
else if (strstr(cpu_part, "0xd05"))
|
||||
return CPU_CORTEXA55;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
@@ -281,6 +286,7 @@ void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
case CPU_CORTEXA53:
|
||||
case CPU_CORTEXA55:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
|
||||
110
cpuid_loongarch64.c
Normal file
110
cpuid_loongarch64.c
Normal file
@@ -0,0 +1,110 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Core identifiers returned by detect(); each value is also an index into cpuname[]. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* Operand handed to the cpucfg instruction in detect(). */
#define LOONGARCH_CFG2  0x02
/*
 * Bit of the cpucfg result that detect() tests (presumably the LASX
 * capability flag - confirm against the LoongArch manual).
 * Parenthesized so the macro expands as a single value in any expression;
 * without the parentheses, e.g. LOONGARCH_LASX + 1 would parse as 1 << (7 + 1).
 */
#define LOONGARCH_LASX  (1 << 7)

/* Printable core names, indexed by the CPU_* identifiers above. */
static char *cpuname[] = {
  "UNKNOWN",
  "LOONGSON3R5"
};
|
||||
|
||||
int detect(void) {
|
||||
uint32_t reg = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
||||
char *get_corename(void) {
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Write the architecture name to stdout, without a trailing newline. */
void get_architecture(void) {
  fputs("LOONGARCH64", stdout);
}
|
||||
|
||||
void get_subarchitecture(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("LOONGSON3R5");
|
||||
} else {
|
||||
printf("UNKNOWN");
|
||||
}
|
||||
}
|
||||
|
||||
/* Write the sub-directory name for this architecture to stdout, without a trailing newline. */
void get_subdirname(void) {
  fputs("loongarch64", stdout);
}
|
||||
|
||||
/*
 * Print the configuration macros for this target to stdout, one
 * "#define ..." per line.
 *
 * Every core receives the same LOONGSON3R5 parameter set, so the text is
 * emitted unconditionally rather than from two identical branches.
 * NOTE(review): a core that detect() does not recognise therefore also gets
 * "#define LOONGSON3R5" - confirm that this is the intended fallback.
 */
void get_cpuconfig(void) {
  fputs("#define LOONGSON3R5\n"
        "#define L1_DATA_SIZE 65536\n"
        "#define L1_DATA_LINESIZE 64\n"
        "#define L2_SIZE 1048576\n"
        "#define L2_LINESIZE 64\n"
        "#define DTB_DEFAULT_ENTRIES 64\n"
        "#define DTB_SIZE 4096\n"
        "#define L2_ASSOCIATIVE 16\n",
        stdout);
}
|
||||
|
||||
void get_libname(void){
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("loongson3r5\n");
|
||||
} else {
|
||||
printf("loongarch64\n");
|
||||
}
|
||||
}
|
||||
111
cpuid_x86.c
111
cpuid_x86.c
@@ -283,6 +283,7 @@ int get_vendor(void){
|
||||
if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX;
|
||||
if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN;
|
||||
if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN;
|
||||
if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE;
|
||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
||||
@@ -1066,7 +1067,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
||||
|
||||
if ((get_vendor() == VENDOR_AMD) ||
|
||||
(get_vendor() == VENDOR_HYGON) ||
|
||||
(get_vendor() == VENDOR_CENTAUR)) {
|
||||
(get_vendor() == VENDOR_CENTAUR) ||
|
||||
(get_vendor() == VENDOR_ZHAOXIN)) {
|
||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
LDTB.size = 4096;
|
||||
@@ -1189,7 +1191,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
||||
|
||||
int get_cpuname(void){
|
||||
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
int family, exfamily, model, vendor, exmodel, stepping;
|
||||
|
||||
if (!have_cpuid()) return CPUTYPE_80386;
|
||||
|
||||
@@ -1197,6 +1199,7 @@ int get_cpuname(void){
|
||||
exfamily = get_cputype(GET_EXFAMILY);
|
||||
model = get_cputype(GET_MODEL);
|
||||
exmodel = get_cputype(GET_EXMODEL);
|
||||
stepping = get_cputype(GET_STEPPING);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
@@ -1398,6 +1401,17 @@ int get_cpuname(void){
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 10: // Ice Lake SP
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
@@ -1418,6 +1432,15 @@ int get_cpuname(void){
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 12: // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
@@ -1436,6 +1459,15 @@ int get_cpuname(void){
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1598,13 +1630,20 @@ int get_cpuname(void){
|
||||
switch (family) {
|
||||
case 0x5:
|
||||
return CPUTYPE_CENTAURC6;
|
||||
break;
|
||||
case 0x6:
|
||||
return CPUTYPE_NANO;
|
||||
break;
|
||||
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return CPUTYPE_NANO;
|
||||
return CPUTYPE_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return CPUTYPE_NEHALEM;
|
||||
else
|
||||
return CPUTYPE_VIAC3;
|
||||
}
|
||||
return CPUTYPE_VIAC3;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_RISE){
|
||||
@@ -1837,7 +1876,7 @@ char *get_lower_cpunamechar(void){
|
||||
|
||||
int get_coretype(void){
|
||||
|
||||
int family, exfamily, model, exmodel, vendor;
|
||||
int family, exfamily, model, exmodel, vendor, stepping;
|
||||
|
||||
if (!have_cpuid()) return CORE_80486;
|
||||
|
||||
@@ -1845,6 +1884,7 @@ int get_coretype(void){
|
||||
exfamily = get_cputype(GET_EXFAMILY);
|
||||
model = get_cputype(GET_MODEL);
|
||||
exmodel = get_cputype(GET_EXMODEL);
|
||||
stepping = get_cputype(GET_STEPPING);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
@@ -2014,6 +2054,19 @@ int get_coretype(void){
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
case 5:
|
||||
switch (model) {
|
||||
@@ -2081,7 +2134,22 @@ int get_coretype(void){
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
#endif
|
||||
if (model == 10)
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 7:
|
||||
if (model == 10)
|
||||
@@ -2102,6 +2170,16 @@ int get_coretype(void){
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
@@ -2216,10 +2294,19 @@ int get_coretype(void){
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return CORE_NANO;
|
||||
break;
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return CORE_NANO;
|
||||
return CORE_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return CORE_NEHALEM;
|
||||
else
|
||||
return CORE_VIAC3;
|
||||
}
|
||||
return CORE_VIAC3;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
return CORE_UNKNOWN;
|
||||
|
||||
4
ctest.c
4
ctest.c
@@ -157,6 +157,10 @@ ARCH_ARM64
|
||||
ARCH_RISCV64
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
ARCH_LOONGARCH64
|
||||
#endif
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
HAVE_C11
|
||||
#endif
|
||||
|
||||
@@ -4,10 +4,22 @@ include_directories(${PROJECT_BINARY_DIR})
|
||||
enable_language(Fortran)
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
|
||||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
|
||||
"$ErrorActionPreference = \"Stop\"\n"
|
||||
"Get-Content $args[1] | & $args[0]\n"
|
||||
)
|
||||
set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1")
|
||||
else()
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh
|
||||
"$1 < $2\n"
|
||||
)
|
||||
set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh")
|
||||
endif()
|
||||
|
||||
foreach(float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char_upper)
|
||||
@@ -21,7 +33,7 @@ foreach(float_type ${FLOAT_TYPES})
|
||||
c_${float_char}blas1.c)
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat1"
|
||||
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1")
|
||||
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
|
||||
|
||||
#level2
|
||||
add_executable(x${float_char}cblat2
|
||||
@@ -33,7 +45,7 @@ foreach(float_type ${FLOAT_TYPES})
|
||||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat2"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
|
||||
#level3
|
||||
add_executable(x${float_char}cblat3
|
||||
@@ -45,6 +57,6 @@ foreach(float_type ${FLOAT_TYPES})
|
||||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
|
||||
endforeach()
|
||||
|
||||
@@ -6,6 +6,9 @@ TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
ifeq ($(F_COMPILER),GFORTRAN)
|
||||
override FFLAGS += -fno-tree-vectorize
|
||||
endif
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
@@ -212,6 +215,9 @@ ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB = -lomp
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
CEXTRALIB = -lgomp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
||||
@@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) );
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
|
||||
*incx, beta, y, *incy );
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
beta, y, *incy);
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
x, *incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
*incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA=*n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha,
|
||||
cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
*incy, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX ));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX ));
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
@@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
||||
@@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
|
||||
for( i=0; i<*m; i++ ) {
|
||||
for( j=0; j<*n; j++ )
|
||||
@@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -411,8 +411,8 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -451,8 +451,8 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -488,8 +488,8 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -540,8 +540,8 @@ void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
||||
@@ -26,34 +26,34 @@ void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = (double *)malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = (double *)malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*k)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*k)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B = ( double* )malloc( LDB*(*n)*sizeof( double ) );
|
||||
B = ( double* )malloc( (size_t)LDB*(*n)*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -89,25 +89,25 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -143,20 +143,20 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*k)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -191,8 +191,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
B = ( double* )malloc( (*n)*LDB*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
B = ( double* )malloc( (*n)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -202,8 +202,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
|
||||
B = ( double* )malloc( LDB*(*k)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
|
||||
B = ( double* )malloc( (size_t)LDB*(*k)*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -211,7 +211,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -249,20 +249,20 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
@@ -300,20 +300,20 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
||||
@@ -19,7 +19,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -43,7 +43,7 @@ void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
|
||||
for( i=0; i<*m; i++ ) {
|
||||
for( j=0; j<*n; j++ )
|
||||
@@ -74,7 +74,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -102,7 +102,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -123,7 +123,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -146,7 +146,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -170,7 +170,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -196,7 +196,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*kl)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -236,7 +236,7 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -282,7 +282,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -325,7 +325,7 @@ void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -369,8 +369,8 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -410,8 +410,8 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -449,8 +449,8 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -485,8 +485,8 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -536,8 +536,8 @@ void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
||||
@@ -23,34 +23,34 @@ void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = (float *)malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = (float *)malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*k)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*k)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B = ( float* )malloc( LDB*(*n)*sizeof( float ) );
|
||||
B = ( float* )malloc( (size_t)LDB*(*n)*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -85,25 +85,25 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -139,20 +139,20 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*k)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -187,8 +187,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
B = ( float* )malloc( (*n)*LDB*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
B = ( float* )malloc( (*n)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -198,8 +198,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
|
||||
B = ( float* )malloc( LDB*(*k)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
|
||||
B = ( float* )malloc( (size_t)LDB*(*k)*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -207,7 +207,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -245,20 +245,20 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
@@ -296,20 +296,20 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
||||
@@ -20,7 +20,7 @@ void F77_zgemv(int *order, char *transp, int *m, int *n,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -50,7 +50,7 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -94,7 +94,7 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -122,7 +122,7 @@ void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -154,7 +154,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
|
||||
*incx, beta, y, *incy );
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -251,8 +251,8 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
beta, y, *incy);
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
AP = (CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -311,7 +311,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
x, *incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -375,7 +375,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
*incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -436,8 +436,8 @@ void F77_ztpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ztpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -491,8 +491,8 @@ void F77_ztpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ztpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -544,7 +544,7 @@ void F77_ztrmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA=*n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -573,7 +573,7 @@ void F77_ztrsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -601,8 +601,8 @@ void F77_zhpr(int *order, char *uplow, int *n, double *alpha,
|
||||
cblas_zhpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -678,8 +678,8 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
*incy, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc( (((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -750,7 +750,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
@@ -784,7 +784,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
||||
@@ -26,7 +26,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -35,7 +35,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -45,7 +45,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
@@ -54,7 +54,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
@@ -63,7 +63,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
}
|
||||
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -103,7 +103,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -112,7 +112,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -120,14 +120,14 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -167,25 +167,25 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -221,7 +221,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -230,7 +230,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -238,7 +238,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -277,7 +277,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -286,7 +286,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -294,7 +294,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -333,8 +333,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -346,8 +346,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -357,7 +357,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -397,8 +397,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -410,8 +410,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -421,7 +421,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -463,7 +463,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -472,7 +472,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -480,7 +480,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
@@ -522,7 +522,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -531,7 +531,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -539,7 +539,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "cblas_test.h"
|
||||
int CBLAS_CallFromC;
|
||||
int RowMajorStrg;
|
||||
|
||||
|
||||
@@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
||||
@@ -1024,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
int i;
|
||||
|
||||
if (!blas_server_avail) return 0;
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
if (blas_server_avail) {
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
|
||||
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_destory(&attr);
|
||||
pthread_attr_destroy(&attr);
|
||||
#endif
|
||||
|
||||
blas_server_avail = 0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&server_lock);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_ZHAOXIN 5
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
@@ -404,6 +405,7 @@ static int get_vendor(void){
|
||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
@@ -414,7 +416,7 @@ static int get_vendor(void){
|
||||
static gotoblas_t *get_coretype(void){
|
||||
|
||||
int eax, ebx, ecx, edx;
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
int family, exfamily, model, vendor, exmodel, stepping;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
@@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){
|
||||
exfamily = BITMASK(eax, 20, 0xff);
|
||||
model = BITMASK(eax, 4, 0x0f);
|
||||
exmodel = BITMASK(eax, 16, 0x0f);
|
||||
stepping = BITMASK(eax, 0, 0x0f);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
@@ -621,6 +624,22 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 10) {
|
||||
// Ice Lake SP
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont Plus
|
||||
@@ -644,6 +663,21 @@ static gotoblas_t *get_coretype(void){
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
@@ -656,7 +690,7 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
case 10:
|
||||
if (model == 5 || model == 6) {
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
@@ -666,7 +700,20 @@ static gotoblas_t *get_coretype(void){
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
}
|
||||
if (model == 7) {
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
@@ -779,10 +826,19 @@ static gotoblas_t *get_coretype(void){
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return &gotoblas_NANO;
|
||||
return &gotoblas_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,68 @@
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
#ifdef DYNAMIC_LIST
|
||||
#ifdef DYN_CORTEXA53
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
#else
|
||||
#define gotoblas_CORTEXA53 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA57
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
#else
|
||||
#define gotoblas_CORTEXA57 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA72
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
#else
|
||||
#define gotoblas_CORTEXA72 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA73
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
#else
|
||||
#define gotoblas_CORTEXA73 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_FALKOR
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
#else
|
||||
#define gotoblas_FALKOR gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_TSV110
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
#else
|
||||
#define gotoblas_TSV110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
#else
|
||||
#define gotoblas_THUNDERX gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX2T99
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
#else
|
||||
#define gotoblas_THUNDERX2T99 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX3T110
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#else
|
||||
#define gotoblas_THUNDERX3T110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_EMAG8180
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
#else
|
||||
#define gotoblas_EMAG8180 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_NEOVERSEN1
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
#define gotoblas_CORTEXA55 gotoblas_ARMV8
|
||||
#endif
|
||||
#else
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
@@ -54,10 +116,12 @@ extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 12
|
||||
#define NUM_CORETYPES 13
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
@@ -68,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__("mrs %0, "#id : "=r" (var)); \
|
||||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
@@ -84,6 +148,7 @@ static char *corename[] = {
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
@@ -100,6 +165,7 @@ char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
@@ -131,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_THUNDERX3T110);
|
||||
case 12: return (&gotoblas_CORTEXA55);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
@@ -189,6 +256,8 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_CORTEXA73;
|
||||
case 0xd0c: // Neoverse N1
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
case 0xd05: // Cortex A55
|
||||
return &gotoblas_CORTEXA55;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
|
||||
@@ -27,7 +27,9 @@ static char *corename[] = {
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
#ifndef C_PGI
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
@@ -38,10 +40,164 @@ char *gotoblas_corename(void) {
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
static int __builtin_cpu_supports(char* arg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(C_PGI) || defined(__clang__)
|
||||
/*
|
||||
* NV HPC compilers do not yet implement __builtin_cpu_is().
|
||||
* Fake a version here for use in the CPU detection code below.
|
||||
*
|
||||
* Strategy here is to first check the CPU to see what it actually is,
|
||||
* and then test the input to see if what the CPU actually is matches
|
||||
* what was requested.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
 * POWER processor version table.
 *
 * Each entry describes one family of Processor Version Register (PVR)
 * values: an entry matches a PVR when (pvr & pvr_mask) == pvr_value.
 * The table is searched from the top and the first match wins, so entries
 * with a more specific mask must come before more general ones (the
 * POWER9 DD2.0/DD2.1 entries precede the generic POWER9 entry).
 *
 * The last entry has pvr_mask == 0 and pvr_value == 0 and therefore matches
 * every PVR; it is the "unknown CPU" sentinel and must stay last.
 *
 * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
 */

#define CPU_UNKNOWN 0
#define CPU_POWER5 5
#define CPU_POWER6 6
#define CPU_POWER8 8
#define CPU_POWER9 9
#define CPU_POWER10 10

/* The table is only ever read, so keep it const. */
static const struct {
    uint32_t pvr_mask;     /* bits of the PVR that are significant for this entry */
    uint32_t pvr_value;    /* expected value of the PVR after masking */
    const char* cpu_name;  /* human-readable name, used for debug output */
    uint32_t cpu_type;     /* one of the CPU_* constants above */
} pvrPOWER [] = {

    { /* POWER6 in P5+ mode; 2.04-compliant processor */
        .pvr_mask = 0xffffffff,
        .pvr_value = 0x0f000001,
        .cpu_name = "POWER5+",
        .cpu_type = CPU_POWER5,
    },

    { /* Power6 aka POWER6X */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x003e0000,
        .cpu_name = "POWER6 (raw)",
        .cpu_type = CPU_POWER6,
    },

    /*
     * NOTE(review): the two POWER7 entries below are classified as
     * CPU_POWER6 - presumably because there is no dedicated POWER7 type in
     * this table; confirm that this is intended.
     */
    { /* Power7 */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x003f0000,
        .cpu_name = "POWER7 (raw)",
        .cpu_type = CPU_POWER6,
    },

    { /* Power7+ */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004A0000,
        .cpu_name = "POWER7+ (raw)",
        .cpu_type = CPU_POWER6,
    },

    { /* Power8E */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004b0000,
        .cpu_name = "POWER8E (raw)",
        .cpu_type = CPU_POWER8,
    },

    { /* Power8NVL */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004c0000,
        .cpu_name = "POWER8NVL (raw)",
        .cpu_type = CPU_POWER8,
    },

    { /* Power8 */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004d0000,
        .cpu_name = "POWER8 (raw)",
        .cpu_type = CPU_POWER8,
    },

    { /* Power9 DD2.0 (mask ignores bit 12 of the PVR) */
        .pvr_mask = 0xffffefff,
        .pvr_value = 0x004e0200,
        .cpu_name = "POWER9 (raw)",
        .cpu_type = CPU_POWER9,
    },

    { /* Power9 DD 2.1 (mask ignores bit 12 of the PVR) */
        .pvr_mask = 0xffffefff,
        .pvr_value = 0x004e0201,
        .cpu_name = "POWER9 (raw)",
        .cpu_type = CPU_POWER9,
    },

    { /* Power9 DD2.2 or later */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x004e0000,
        .cpu_name = "POWER9 (raw)",
        .cpu_type = CPU_POWER9,
    },

    { /* Power10 */
        .pvr_mask = 0xffff0000,
        .pvr_value = 0x00800000,
        .cpu_name = "POWER10 (raw)",
        .cpu_type = CPU_POWER10,
    },

    { /* End of table, pvr_mask and pvr_value must be zero */
        .pvr_mask = 0x0,
        .pvr_value = 0x0,
        .cpu_name = "Unknown",
        .cpu_type = CPU_UNKNOWN,
    },
};
|
||||
|
||||
static int __builtin_cpu_is(const char *cpu) {
|
||||
int i;
|
||||
uint32_t pvr;
|
||||
uint32_t cpu_type;
|
||||
|
||||
asm("mfpvr %0" : "=r"(pvr));
|
||||
|
||||
for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
|
||||
if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(DEBUG)
|
||||
printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
|
||||
pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
|
||||
#endif
|
||||
cpu_type = pvrPOWER[i].cpu_type;
|
||||
|
||||
if (!strcmp(cpu, "power8"))
|
||||
return cpu_type == CPU_POWER8;
|
||||
if (!strcmp(cpu, "power9"))
|
||||
return cpu_type == CPU_POWER9;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* C_PGI || __clang__ */
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
|
||||
#ifndef C_PGI
|
||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
|
||||
return &gotoblas_POWER6;
|
||||
#endif
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
@@ -53,7 +209,7 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_POWER10;
|
||||
#endif
|
||||
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
if (__builtin_cpu_is("power10"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
@@ -77,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||
|
||||
switch (found)
|
||||
{
|
||||
#ifndef C_PGI
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
#endif
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
|
||||
@@ -222,11 +222,11 @@ int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
@@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -460,7 +460,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
@@ -1291,7 +1291,12 @@ UNLOCK_COMMAND(&alloc_lock);
|
||||
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
|
||||
|
||||
error:
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@@ -1619,10 +1624,12 @@ static int on_process_term(void)
|
||||
#else
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
#endif
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
|
||||
@@ -1631,10 +1638,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI
|
||||
#else
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
#endif
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const int(*p_process_term)(void) = on_process_term;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
#endif
|
||||
@@ -1668,16 +1677,23 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#define ALLOC_MMAP
|
||||
#define ALLOC_MALLOC
|
||||
#else
|
||||
#define ALLOC_MALLOC
|
||||
|
||||
/*
 * No-op replacements for a few C library functions, defined in the final
 * #else branch of the allocator-selection conditional (presumably the
 * OS_EMBEDDED configuration - confirm against the full file).
 * puts(), printf() and atoi() ignore their arguments and return 0;
 * getenv() returns an empty string rather than NULL.
 * NOTE(review): these use plain `inline` without `static`/`extern`; under
 * C99 semantics that provides only an inline definition - confirm that the
 * intended toolchain links these as expected.
 */
inline int puts(const char *str) { return 0; }
|
||||
inline int printf(const char *format, ...) { return 0; }
|
||||
inline char *getenv(const char *name) { return ""; }
|
||||
inline int atoi(const char *str) { return 0; }
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
@@ -1691,7 +1707,6 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
@@ -1969,7 +1984,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1977,7 +1992,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -2001,7 +2016,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -2868,8 +2883,12 @@ void *blas_memory_alloc(int procpos){
|
||||
return (void *)memory[position].addr;
|
||||
|
||||
error:
|
||||
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
|
||||
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -139,9 +139,13 @@ endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Changelog
|
||||
# 2017/09/03 staticfloat
|
||||
|
||||
70
f_check
70
f_check
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
||||
@@ -32,9 +32,9 @@ if ($compiler eq "") {
|
||||
"xlf95", "xlf90", "xlf",
|
||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77",
|
||||
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
|
||||
"flang", "egfortran",
|
||||
"ifort");
|
||||
"ifort", "nagfor");
|
||||
|
||||
OUTER:
|
||||
foreach $lists (@lists) {
|
||||
@@ -64,7 +64,9 @@ if ($compiler eq "") {
|
||||
if (!$?) {
|
||||
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
|
||||
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
|
||||
}
|
||||
if ($data =~ /zhoge_/) {
|
||||
$bu = "_";
|
||||
}
|
||||
@@ -76,6 +78,7 @@ if ($compiler eq "") {
|
||||
|
||||
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
|
||||
|
||||
$data =~ s/\(+.*?\)+//g;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
@@ -87,7 +90,7 @@ if ($compiler eq "") {
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /pgf/) {
|
||||
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
} else {
|
||||
@@ -123,7 +126,7 @@ if ($compiler eq "") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /PGF/) {
|
||||
if ($data =~ /PGF/ || $data =~ /NVF/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
@@ -133,8 +136,16 @@ if ($compiler eq "") {
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /NAG/) {
|
||||
$vendor = NAG;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
|
||||
}
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
@@ -177,7 +188,7 @@ if ($compiler eq "") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pgf/) {
|
||||
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
@@ -222,6 +233,12 @@ if ($compiler eq "") {
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /nagfor/) {
|
||||
$vendor = NAG;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
@@ -275,14 +292,20 @@ if (!$?) {
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For nagfor
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
|
||||
if ($binary eq "") {
|
||||
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
|
||||
if ( $vendor eq "NAG") {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
@@ -291,11 +314,11 @@ if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
$link =~ s/\-R\s*/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
$link =~ s/\-rpath\s+/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath-link\s+/\-rpath-link\@/g;
|
||||
$link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
|
||||
|
||||
@flags = split(/[\s\,\n]/, $link);
|
||||
# remove leading and trailing quotes from each flag.
|
||||
@@ -321,21 +344,22 @@ if ($link ne "") {
|
||||
}
|
||||
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($flags =~ /^\-rpath\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath-link\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($flags =~ /^\-rpath-link\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
|
||||
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
|
||||
$flags = "-lomp";
|
||||
}
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /ibrary/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
@@ -352,15 +376,21 @@ if ($link ne "") {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
|
||||
# The NAG compiler links a few runtime object files (quickfit.o, safefit.o,
# thsafe.o); pass them through to the linker flags for NAG only.
# $vendor holds a string, so it must be compared with `eq`: the numeric
# `==` turns both non-numeric strings into 0 and is true for every vendor.
if ( $flags =~ /quickfit.o/ && $vendor eq "NAG") {
    $linker_l .= $flags . " ";
}
if ( $flags =~ /safefit.o/ && $vendor eq "NAG") {
    $linker_l .= $flags . " ";
}
if ( $flags =~ /thsafe.o/ && $vendor eq "NAG") {
    $linker_l .= $flags . " ";
}
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ($vendor eq "INTEL"){
|
||||
$linker_a .= "-lgfortran"
|
||||
}
|
||||
|
||||
if ($vendor eq "FLANG"){
|
||||
$linker_a .= "-lflang"
|
||||
}
|
||||
|
||||
40
getarch.c
40
getarch.c
@@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3R3 */
|
||||
/* #define FORCE_LOONGSON3R4 */
|
||||
/* #define FORCE_LOONGSON3R5 */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
@@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSON3R5
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "LOONGARCH"
|
||||
#define SUBARCHITECTURE "LOONGSON3R5"
|
||||
#define SUBDIRNAME "loongarch64"
|
||||
#define ARCHCONFIG "-DLOONGSON3R5 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
|
||||
#define LIBNAME "loongson3r5"
|
||||
#define CORENAME "LOONGSON3R5"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_I6400
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
@@ -1159,6 +1174,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA55
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA55"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA55 " \
|
||||
"-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa55"
|
||||
#define CORENAME "CORTEXA55"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_FALKOR
|
||||
#define FORCE
|
||||
@@ -1373,8 +1403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
#include "cpuid_loongarch64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __riscv
|
||||
#include "cpuid_riscv64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __arm__
|
||||
@@ -1447,7 +1483,7 @@ int main(int argc, char *argv[]){
|
||||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
@@ -1595,7 +1631,7 @@ printf("ELF_VERSION=2\n");
|
||||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -4,6 +4,14 @@
|
||||
#else
|
||||
#include "config_kernel.h"
|
||||
#endif
|
||||
#if (defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64)) && defined(__64BIT__)
|
||||
typedef long long BLASLONG;
|
||||
typedef unsigned long long BLASULONG;
|
||||
#else
|
||||
typedef long BLASLONG;
|
||||
typedef unsigned long BLASULONG;
|
||||
#endif
|
||||
|
||||
#include "param.h"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
@@ -316,7 +316,7 @@ CCBLAS1OBJS = \
|
||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||
@@ -346,7 +346,7 @@ CZBLAS1OBJS = \
|
||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
||||
|
||||
|
||||
CZBLAS2OBJS = \
|
||||
@@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
|
||||
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
@@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
|
||||
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
|
||||
|
||||
cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
$count = 0;
|
||||
|
||||
|
||||
@@ -49,6 +49,8 @@
|
||||
#define ERROR_NAME "QGEMM "
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGEMM "
|
||||
#elif defined(BFLOAT16)
|
||||
#define ERROR_NAME "SBGEMM "
|
||||
#else
|
||||
#define ERROR_NAME "SGEMM "
|
||||
#endif
|
||||
@@ -124,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||
|
||||
#ifdef SMP
|
||||
double MNK;
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
@@ -142,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
|
||||
int nodes;
|
||||
@@ -246,6 +250,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
|
||||
#ifdef SMP
|
||||
double MNK;
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
@@ -264,6 +269,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
|
||||
int nodes;
|
||||
@@ -417,8 +423,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
#endif
|
||||
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
|
||||
@@ -201,7 +201,14 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
#if 0
|
||||
/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */
|
||||
if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) {
|
||||
GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (m == 0 || n == 0) return;
|
||||
if (alpha == 0.) return;
|
||||
|
||||
if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) {
|
||||
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
#endif
|
||||
|
||||
if ( *lda > *ldb )
|
||||
msize = (*lda) * (*ldb) * sizeof(FLOAT);
|
||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
|
||||
else
|
||||
msize = (*ldb) * (*ldb) * sizeof(FLOAT);
|
||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
||||
@@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
#ifndef DOUBLE
|
||||
if (args.m*args.n < 40000)
|
||||
#else
|
||||
if (args.m*args.n < 10000)
|
||||
#endif
|
||||
args.nthreads=1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
#ifndef DOUBLE
|
||||
if (args.n <128)
|
||||
#else
|
||||
if (args.n <64)
|
||||
#endif
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
if (args.n < 180)
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
if (args.m*args.n <10000)
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
#ifndef DOUBLE
|
||||
if (args.n < 64)
|
||||
#else
|
||||
if (args.n < 64)
|
||||
#endif
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
#ifndef DOUBLE
|
||||
if (args.n < 200)
|
||||
#else
|
||||
if (args.n < 150)
|
||||
#endif
|
||||
args.nthreads=1;
|
||||
else
|
||||
#endif
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
dq1 = dp1 * *dx1;
|
||||
if(ABS(dq1) > ABS(dq2))
|
||||
{
|
||||
dflag = ZERO;
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dh21 = - dy1 / *dx1;
|
||||
|
||||
@@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (incx == 1 && n <100) {
|
||||
blasint i;
|
||||
if (uplo==0) {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += i + 1;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += n - i;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
@@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
if (incx == 1 && incy == 1 && n < 50) {
|
||||
blasint i;
|
||||
if (!uplo) {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
|
||||
a += i + 1;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
|
||||
a += n - i;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
#if 1
|
||||
if (incx == 1 && n < 100) {
|
||||
BLASLONG i;
|
||||
|
||||
if (uplo == 0) {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += lda;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += 1 + lda;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
@@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
if (incx == 1 && incy == 1 && n < 100) {
|
||||
blasint i;
|
||||
if (!uplo) {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
|
||||
a += lda;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
|
||||
a += 1 + lda;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
|
||||
@@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
|
||||
#endif
|
||||
|
||||
args.common = NULL;
|
||||
#ifndef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
if (args.n < 100)
|
||||
#else
|
||||
if (args.n < 200)
|
||||
#endif
|
||||
#else
|
||||
if (args.n < 65)
|
||||
#endif
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (incx == 1 && trans == 0 && n < 50) {
|
||||
buffer = NULL;
|
||||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
#endif
|
||||
|
||||
if ( *lda > *ldb )
|
||||
msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
else
|
||||
msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
||||
@@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
aa_i = fabs(da_r);
|
||||
}
|
||||
|
||||
scale = (aa_i / aa_r);
|
||||
ada = aa_r * sqrt(ONE + scale * scale);
|
||||
if (aa_r == ZERO) {
|
||||
ada = 0.;
|
||||
} else {
|
||||
scale = (aa_i / aa_r);
|
||||
ada = aa_r * sqrt(ONE + scale * scale);
|
||||
}
|
||||
|
||||
bb_r = fabs(db_r);
|
||||
bb_i = fabs(db_i);
|
||||
@@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
bb_i = fabs(bb_r);
|
||||
}
|
||||
|
||||
scale = (bb_i / bb_r);
|
||||
adb = bb_r * sqrt(ONE + scale * scale);
|
||||
|
||||
if (bb_r == ZERO) {
|
||||
adb = 0.;
|
||||
} else {
|
||||
scale = (bb_i / bb_r);
|
||||
adb = bb_r * sqrt(ONE + scale * scale);
|
||||
}
|
||||
scale = ada + adb;
|
||||
|
||||
aa_r = da_r / scale;
|
||||
|
||||
@@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
if (incx == 1 && n < 50) {
|
||||
blasint i;
|
||||
if (!uplo) {
|
||||
for (i = 0; i < n; i++){
|
||||
if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
|
||||
AXPYU_K(i + 1, 0, 0,
|
||||
alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
|
||||
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
|
||||
x, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += lda;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
|
||||
AXPYU_K(n - i, 0, 0,
|
||||
alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
|
||||
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
|
||||
x + i * 2, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += 2 + lda;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (incx == 1 && trans == 0 && n < 50) {
|
||||
buffer = NULL;
|
||||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
endif ()
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
|
||||
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
|
||||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
|
||||
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
@@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
FMAFLAG=
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
FMAFLAG = -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
### AMAX ###
|
||||
|
||||
ifndef SAMAXKERNEL
|
||||
@@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
||||
$(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
FMAFLAG=
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
FMAFLAG = -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
### GEMV ###
|
||||
|
||||
ifndef SGEMVNKERNEL
|
||||
@@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
|
||||
|
||||
$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)
|
||||
|
||||
@@ -818,6 +818,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
endif
|
||||
@@ -828,6 +830,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
endif
|
||||
@@ -838,6 +842,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
@@ -848,6 +854,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
endif
|
||||
@@ -1044,6 +1052,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1054,6 +1064,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1064,6 +1076,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
@@ -1074,6 +1088,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
@@ -1084,6 +1100,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1094,6 +1112,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1104,6 +1124,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
@@ -1114,6 +1136,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
@@ -1187,29 +1211,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*****************************************************
|
||||
* 2014/06/09 Saar
|
||||
*
|
||||
* Order rowMajor
|
||||
* Trans
|
||||
*
|
||||
******************************************************/
|
||||
|
||||
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
|
||||
{
|
||||
BLASLONG i,j;
|
||||
FLOAT *aptr,*bptr;
|
||||
BLASLONG i, j;
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;
|
||||
|
||||
if ( rows <= 0 ) return(0);
|
||||
if ( cols <= 0 ) return(0);
|
||||
if (rows <= 0) return 0;
|
||||
if (cols <= 0) return 0;
|
||||
|
||||
aptr = a;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for ( i=0; i<rows ; i++ )
|
||||
{
|
||||
bptr = &b[i];
|
||||
for(j=0; j<cols; j++)
|
||||
{
|
||||
bptr[j*ldb] = alpha * aptr[j];
|
||||
}
|
||||
aptr += lda;
|
||||
}
|
||||
i = (rows >> 2);
|
||||
if (i > 0) {
|
||||
do {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
return(0);
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
b_offset += 4;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0) {
|
||||
do {
|
||||
/* Column 1 of MAT_B */
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
/* Column 2 of MAT_B */
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
|
||||
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
|
||||
|
||||
/* Column 3 of MAT_B */
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
|
||||
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
|
||||
*(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
|
||||
*(b_offset4 + 2) = *(a_offset3 + 3)*alpha;
|
||||
|
||||
/* Column 4 of MAT_B */
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
|
||||
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
|
||||
*(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
|
||||
*(b_offset4 + 3) = *(a_offset4 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
a_offset3 += 4;
|
||||
a_offset4 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
} // if(j > 0)
|
||||
|
||||
|
||||
if (cols & 2) {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
|
||||
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
|
||||
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
a_offset3 += 2;
|
||||
a_offset4 += 2;
|
||||
|
||||
b_offset1 += ldb*2;
|
||||
|
||||
}
|
||||
|
||||
if (cols & 1) {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
|
||||
}
|
||||
|
||||
i--;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
|
||||
if (rows & 2) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
b_offset += 2;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0){
|
||||
do {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
|
||||
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
|
||||
if (cols & 2){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
b_offset1 += ldb*2;
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (cols & 1){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
}
|
||||
} // if (rows & 2)
|
||||
|
||||
|
||||
if (rows & 1) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset += lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0){
|
||||
do {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
if (cols & 2){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
b_offset1 += ldb * 2;
|
||||
}
|
||||
|
||||
if (cols & 1){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
|
||||
dot[0]=0.0;
|
||||
dot[1]=0.0;
|
||||
#if !defined(__PPC__) && !defined(__SunOS)
|
||||
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
|
||||
CREAL(result) = 0.0 ;
|
||||
CIMAG(result) = 0.0 ;
|
||||
#else
|
||||
@@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
i++ ;
|
||||
|
||||
}
|
||||
#if !defined(__PPC__) && !defined(__SunOS)
|
||||
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
|
||||
CREAL(result) = dot[0];
|
||||
CIMAG(result) = dot[1];
|
||||
#else
|
||||
|
||||
@@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
|
||||
@@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user