Compare commits
419 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
33f76a6c37 | ||
|
|
960dec234f | ||
|
|
6b92979f35 | ||
|
|
014fc13995 | ||
|
|
f1e05676a0 | ||
|
|
d221c50f27 | ||
|
|
f14013da7f | ||
|
|
4f371b0fbf | ||
|
|
02d60c1563 | ||
|
|
2e6963259b | ||
|
|
e94590e400 | ||
|
|
69d4687142 | ||
|
|
a731a9bbb9 | ||
|
|
1aa5907a2c | ||
|
|
ea8eec5d17 | ||
|
|
97ce6bbce2 | ||
|
|
a9aeb6745c | ||
|
|
0af9991cc9 | ||
|
|
19f3a4091c | ||
|
|
c623a965f9 | ||
|
|
e1062400c4 | ||
|
|
a66f4d80c8 | ||
|
|
dd22eb7621 | ||
|
|
2352331e60 | ||
|
|
430ee31e66 | ||
|
|
8164fd1328 | ||
|
|
531c6b96d6 | ||
|
|
1b980001dd | ||
|
|
2515e1152f | ||
|
|
ddcbed6690 | ||
|
|
f8ec538c82 | ||
|
|
ca4f7dceff | ||
|
|
1ddf9f1067 | ||
|
|
4c5fac5a2b | ||
|
|
320e2648cd | ||
|
|
9b732696c6 | ||
|
|
c9dcb3d4a4 | ||
|
|
3bb7f0138e | ||
|
|
c93ae92579 | ||
|
|
87ac1ceb0b | ||
|
|
9e40c080f2 | ||
|
|
903854c168 | ||
|
|
a2ff577a30 | ||
|
|
97a32cb0a5 | ||
|
|
f1746e7284 | ||
|
|
f6fcbd7906 | ||
|
|
1e8410f18c | ||
|
|
07454bf4d5 | ||
|
|
4046985913 | ||
|
|
75577f95a7 | ||
|
|
33d92c7a37 | ||
|
|
e57b11acca | ||
|
|
71e5669c3e | ||
|
|
e8d82c01d4 | ||
|
|
0b39cf95b0 | ||
|
|
76b2cec6ce | ||
|
|
276c1791ea | ||
|
|
c5bbfd8fee | ||
|
|
130c1741e5 | ||
|
|
8f782f0673 | ||
|
|
6a517dcb6a | ||
|
|
9f39f0a2c3 | ||
|
|
1a88c4ab26 | ||
|
|
0b44802164 | ||
|
|
2c242b4cef | ||
|
|
0bfb7336d2 | ||
|
|
403cde104e | ||
|
|
634f2bddda | ||
|
|
aeea14ee40 | ||
|
|
18bcc36a69 | ||
|
|
0e7f43c898 | ||
|
|
79e201fbba | ||
|
|
4326dcb460 | ||
|
|
e32f3b1447 | ||
|
|
d483e9270a | ||
|
|
01834aee33 | ||
|
|
b0558c11b9 | ||
|
|
f566787e6e | ||
|
|
e3368cbf18 | ||
|
|
d92bd5be24 | ||
|
|
46e4b12946 | ||
|
|
5e94aa4877 | ||
|
|
93f3e27574 | ||
|
|
785c389b0e | ||
|
|
c222b25b81 | ||
|
|
221da8bf05 | ||
|
|
eb285b4d20 | ||
|
|
cafdd999b8 | ||
|
|
92ca92a46c | ||
|
|
486c35c5dc | ||
|
|
0e05ea9bac | ||
|
|
5ba3699f41 | ||
|
|
8eefa530cd | ||
|
|
de40d47edf | ||
|
|
7c162b8a21 | ||
|
|
0544cbc806 | ||
|
|
120d20731f | ||
|
|
dc345d84df | ||
|
|
616921fd91 | ||
|
|
8a9e9a82a1 | ||
|
|
7ea5e07d1c | ||
|
|
cb6ef49857 | ||
|
|
63994e1cdb | ||
|
|
496e3019bc | ||
|
|
169be3f097 | ||
|
|
6ccbb089c2 | ||
|
|
59ebe3636a | ||
|
|
5a6bba3061 | ||
|
|
dff173e50e | ||
|
|
7e5cbb6f35 | ||
|
|
303bdb673b | ||
|
|
754433f420 | ||
|
|
7f0d523b42 | ||
|
|
c353d8b106 | ||
|
|
579be3aa9d | ||
|
|
449e8ea443 | ||
|
|
3bec250cf9 | ||
|
|
f03dd23e90 | ||
|
|
fb5eb47558 | ||
|
|
fa93d63365 | ||
|
|
90e6c66a57 | ||
|
|
32d97330b3 | ||
|
|
29eaf4b6d7 | ||
|
|
47c1bf7f4d | ||
|
|
2b55f0ad30 | ||
|
|
a5b32ab06c | ||
|
|
50545b19d0 | ||
|
|
b3cbd60d7a | ||
|
|
70199d1905 | ||
|
|
cfe63d8cc2 | ||
|
|
d55b10830f | ||
|
|
c1c10cbb21 | ||
|
|
5989841524 | ||
|
|
68a43db358 | ||
|
|
9694037b23 | ||
|
|
71faa1c1a7 | ||
|
|
3447d04eaf | ||
|
|
8b5cdcc64c | ||
|
|
4e00d96a78 | ||
|
|
ce9ea8f826 | ||
|
|
0b909203cb | ||
|
|
096da2f51a | ||
|
|
2f96a2c55b | ||
|
|
833bd0f8ff | ||
|
|
77b8f49556 | ||
|
|
1c3e20ce48 | ||
|
|
83b6be7976 | ||
|
|
081b188529 | ||
|
|
f3f969f681 | ||
|
|
8019e70211 | ||
|
|
8d2a796f49 | ||
|
|
8dc9fd4dfe | ||
|
|
abc67bdd74 | ||
|
|
1f62a82789 | ||
|
|
e9fb8f62b1 | ||
|
|
fbf4f48f4a | ||
|
|
b9ad450295 | ||
|
|
e011ad820a | ||
|
|
ff42e68652 | ||
|
|
23f322f997 | ||
|
|
093d37de8d | ||
|
|
d65e9a2bbd | ||
|
|
78100b8093 | ||
|
|
70f45749b9 | ||
|
|
e5dcdeb550 | ||
|
|
952cc2ba38 | ||
|
|
feaafbedd3 | ||
|
|
1c67567008 | ||
|
|
4e979bf75b | ||
|
|
daa4310db5 | ||
|
|
b8f3605132 | ||
|
|
b36018be6d | ||
|
|
3a100b2797 | ||
|
|
38742d5547 | ||
|
|
bd4c032f52 | ||
|
|
9dc9b7b95e | ||
|
|
9f5cdc49d4 | ||
|
|
b7b408a120 | ||
|
|
92b10212de | ||
|
|
b73bf01378 | ||
|
|
eb3c9f1db9 | ||
|
|
fd2ff2714f | ||
|
|
2ea2bd99c7 | ||
|
|
fbb894948c | ||
|
|
e711659c90 | ||
|
|
893e6e57c4 | ||
|
|
456ee2e1f0 | ||
|
|
9998f8ed8b | ||
|
|
80db5f11e1 | ||
|
|
52de4cc8fd | ||
|
|
44028581cc | ||
|
|
86ab939936 | ||
|
|
375b1875c8 | ||
|
|
6c85cb1869 | ||
|
|
995768bbc5 | ||
|
|
96ad579428 | ||
|
|
8d84403205 | ||
|
|
8729db117c | ||
|
|
0833a4846a | ||
|
|
50f7fc1401 | ||
|
|
d1b53806be | ||
|
|
a0f0a802fc | ||
|
|
700fe5b5ee | ||
|
|
bb2729c855 | ||
|
|
aae44d040d | ||
|
|
6362c34ee6 | ||
|
|
f60840c420 | ||
|
|
109e18cd96 | ||
|
|
ae1579be13 | ||
|
|
3ccf8885ac | ||
|
|
454847588e | ||
|
|
0257f26488 | ||
|
|
c45b7aef14 | ||
|
|
312060d0d6 | ||
|
|
cd765f094b | ||
|
|
64639f440f | ||
|
|
3a66c8cac1 | ||
|
|
4c35b8dbaa | ||
|
|
ed9af2f7da | ||
|
|
5fd1edead9 | ||
|
|
26478eb0d0 | ||
|
|
eeecd623d8 | ||
|
|
3ce6bcdb5f | ||
|
|
6fbe51072b | ||
|
|
611445c7f8 | ||
|
|
2cd9306bb5 | ||
|
|
c418c81224 | ||
|
|
025741f16a | ||
|
|
0ae49d2990 | ||
|
|
105e26e12a | ||
|
|
f41d52665d | ||
|
|
d573d24de7 | ||
|
|
31d6c2eb7d | ||
|
|
b7cc69ee62 | ||
|
|
aeef942c4f | ||
|
|
445ca2f418 | ||
|
|
13226e3101 | ||
|
|
1a6ea8ee6d | ||
|
|
c6ecb195e6 | ||
|
|
b28db31429 | ||
|
|
6baa9b07d7 | ||
|
|
a4896b5538 | ||
|
|
3938e59569 | ||
|
|
9d5079008f | ||
|
|
3518617f5b | ||
|
|
715f4650d9 | ||
|
|
10705183ce | ||
|
|
235599f17a | ||
|
|
b863b32ac5 | ||
|
|
dd04143d4a | ||
|
|
f3a6164bff | ||
|
|
dedd822d1a | ||
|
|
2181fb7047 | ||
|
|
a9b62c03f8 | ||
|
|
97762234f9 | ||
|
|
948d11fc51 | ||
|
|
c815b8fb85 | ||
|
|
e20709e976 | ||
|
|
934e601e93 | ||
|
|
a4c3668f99 | ||
|
|
867232c6a4 | ||
|
|
5aaf70ef95 | ||
|
|
ae2a0995cc | ||
|
|
83dae28ae2 | ||
|
|
da986d2e83 | ||
|
|
6bc487de35 | ||
|
|
cf2a8e410c | ||
|
|
eb1e9c8c92 | ||
|
|
f95989cbc1 | ||
|
|
f3065a0eed | ||
|
|
04226f1e97 | ||
|
|
0925ef70db | ||
|
|
371e6f73d4 | ||
|
|
d117dfd505 | ||
|
|
883c39773a | ||
|
|
b09b5be0a4 | ||
|
|
bfb5fbdb4d | ||
|
|
3da6d66da9 | ||
|
|
08fa83aba2 | ||
|
|
63d3ee8dfc | ||
|
|
1191db1a49 | ||
|
|
1f6071590d | ||
|
|
0caf1434c9 | ||
|
|
73128f3883 | ||
|
|
cad0d150db | ||
|
|
eba0aeb7cd | ||
|
|
0c07c356c1 | ||
|
|
82b75f97e5 | ||
|
|
7887c45077 | ||
|
|
3e67017ac8 | ||
|
|
b3ac6ee222 | ||
|
|
6082e556cd | ||
|
|
92315173d5 | ||
|
|
351d12b94e | ||
|
|
bf73aa141b | ||
|
|
71e96163db | ||
|
|
819e852ae7 | ||
|
|
4e466d739c | ||
|
|
4c6a457358 | ||
|
|
836c414e22 | ||
|
|
d403eb3c2f | ||
|
|
3cd97f1a80 | ||
|
|
9955f0996f | ||
|
|
430c11e135 | ||
|
|
fbacd2605d | ||
|
|
6fa89b06a1 | ||
|
|
68597002ea | ||
|
|
d2a6285549 | ||
|
|
d999688d1a | ||
|
|
928fe1b28e | ||
|
|
ccc28c6d60 | ||
|
|
ae43b75a6a | ||
|
|
54fc06fd70 | ||
|
|
1df9a2013d | ||
|
|
274ff5cdb8 | ||
|
|
eb2eddf241 | ||
|
|
8691825944 | ||
|
|
7dc8a76f60 | ||
|
|
df857551c0 | ||
|
|
85ccdce8c4 | ||
|
|
aeabe0a83f | ||
|
|
1b90989662 | ||
|
|
e3e8b5cdca | ||
|
|
69b16a894d | ||
|
|
6782e5767d | ||
|
|
48f5a89f92 | ||
|
|
4ae1610f37 | ||
|
|
911c3e2f4b | ||
|
|
fab49e49e5 | ||
|
|
b687fba5bc | ||
|
|
46a8c2519a | ||
|
|
e9437eebd2 | ||
|
|
3a39062cfc | ||
|
|
eaa0be1313 | ||
|
|
6ff013bae0 | ||
|
|
0d669e04bb | ||
|
|
17cdd9f9e1 | ||
|
|
6bcb06fcb1 | ||
|
|
b7315f8401 | ||
|
|
9b19e9e1b0 | ||
|
|
6bd67ddbab | ||
|
|
5da9484d93 | ||
|
|
844629af57 | ||
|
|
2beaa82c05 | ||
|
|
e8a2aed2b9 | ||
|
|
f262031685 | ||
|
|
5f6206fa2d | ||
|
|
f2cde2ccfb | ||
|
|
ba7838d2e1 | ||
|
|
a448884a63 | ||
|
|
17609f88f1 | ||
|
|
3a2df19db6 | ||
|
|
d2093a40d3 | ||
|
|
aa04b0925e | ||
|
|
258ac56e0a | ||
|
|
56837e9d92 | ||
|
|
bb5413863f | ||
|
|
32f5907fef | ||
|
|
ac10236cc8 | ||
|
|
8617d75548 | ||
|
|
c07d78b9e9 | ||
|
|
6355c25dde | ||
|
|
5e244d80f2 | ||
|
|
ede5efebab | ||
|
|
84908d60d2 | ||
|
|
596a22325a | ||
|
|
7f58f3ad0e | ||
|
|
c0d570a357 | ||
|
|
6b83079368 | ||
|
|
673e5a0495 | ||
|
|
bfa2cc7d64 | ||
|
|
e7c4d6705a | ||
|
|
2a1911cc14 | ||
|
|
9f7a9a32e3 | ||
|
|
2463938879 | ||
|
|
5d6525c87c | ||
|
|
6cb47ea3f0 | ||
|
|
459bb9291d | ||
|
|
3f1077ce6f | ||
|
|
eb45eb6942 | ||
|
|
f2becb777a | ||
|
|
5997b6b491 | ||
|
|
4b21b646ea | ||
|
|
7ec7b999a5 | ||
|
|
af9ac0898a | ||
|
|
c7b5a459b6 | ||
|
|
9b2f0323d6 | ||
|
|
9f6984fe4b | ||
|
|
42203dafdc | ||
|
|
a4f17a9297 | ||
|
|
733d97b2df | ||
|
|
ea747cf933 | ||
|
|
4de545aa7d | ||
|
|
6e9a93ec19 | ||
|
|
fde8a8e6a0 | ||
|
|
256fc15f5f | ||
|
|
ee498525e0 | ||
|
|
1fec0570f6 | ||
|
|
b5af7b9c78 | ||
|
|
f3c314550c | ||
|
|
847c20c9b7 | ||
|
|
4c22828812 | ||
|
|
e79712d969 | ||
|
|
be09551cdf | ||
|
|
ec1ef6aa9e | ||
|
|
11c59acfb1 | ||
|
|
bf0d92a310 | ||
|
|
db066151ee | ||
|
|
3a55dca2dc | ||
|
|
7d380f7d79 | ||
|
|
300f158d3b | ||
|
|
3635fdbf2b | ||
|
|
b6552b11eb | ||
|
|
3dc6b26eff | ||
|
|
5f36f18148 | ||
|
|
d47fe78b0e | ||
|
|
ebe2f47a0f | ||
|
|
20d417762f | ||
|
|
15cb124012 |
@@ -92,7 +92,7 @@ steps:
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest
|
||||
- ctest -V
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
@@ -116,7 +116,7 @@ steps:
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest
|
||||
- ctest -V
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
@@ -140,4 +140,4 @@ steps:
|
||||
- mkdir build && cd build
|
||||
- cmake $CMAKE_FLAGS ..
|
||||
- make -j
|
||||
- ctest
|
||||
- ctest -V
|
||||
|
||||
78
.github/workflows/nightly-Homebrew-build.yml
vendored
Normal file
78
.github/workflows/nightly-Homebrew-build.yml
vendored
Normal file
@@ -0,0 +1,78 @@
|
||||
# Only the "head" branch of the OpenBLAS package is tested
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- '**/nightly-Homebrew-build.yml'
|
||||
pull_request:
|
||||
branches:
|
||||
- develop
|
||||
paths:
|
||||
- '**/nightly-Homebrew-build.yml'
|
||||
schedule:
|
||||
- cron: 45 7 * * *
|
||||
# This is 7:45 AM UTC daily, late at night in the USA
|
||||
|
||||
# Since push and pull_request will still always be building and testing the `develop` branch,
|
||||
# it only makes sense to test if this file has been changed
|
||||
|
||||
name: Nightly-Homebrew-Build
|
||||
jobs:
|
||||
build-OpenBLAS-with-Homebrew:
|
||||
runs-on: macos-latest
|
||||
env:
|
||||
HOMEBREW_DEVELOPER: "ON"
|
||||
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
|
||||
HOMEBREW_NO_ANALYTICS: "ON"
|
||||
HOMEBREW_NO_AUTO_UPDATE: "ON"
|
||||
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
|
||||
HOMEBREW_NO_INSTALL_CLEANUP: "ON"
|
||||
|
||||
steps:
|
||||
- name: Random delay for cron job
|
||||
run: |
|
||||
delay=$(( RANDOM % 600 ))
|
||||
printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}"
|
||||
sleep ${delay}
|
||||
if: github.event_name == 'schedule'
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
# This isn't even needed, technically. Homebrew will get `develop` via git
|
||||
|
||||
- name: Update Homebrew
|
||||
if: github.event_name != 'pull_request'
|
||||
run: brew update || true
|
||||
|
||||
- name: Install prerequisites
|
||||
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||
|
||||
- name: Install and bottle OpenBLAS
|
||||
run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas
|
||||
# the HEAD flags tell Homebrew to build the develop branch fetched via git
|
||||
|
||||
- name: Create bottle
|
||||
run: |
|
||||
brew bottle -v openblas
|
||||
mkdir bottles
|
||||
mv *.bottle.tar.gz bottles
|
||||
|
||||
- name: Upload bottle
|
||||
uses: actions/upload-artifact@v1
|
||||
with:
|
||||
name: openblas--HEAD.catalina.bottle.tar.gz
|
||||
path: bottles
|
||||
|
||||
- name: Show linkage
|
||||
run: brew linkage -v openblas
|
||||
|
||||
- name: Test openblas
|
||||
run: brew test --HEAD --verbose openblas
|
||||
|
||||
- name: Audit openblas formula
|
||||
run: |
|
||||
brew audit --strict openblas
|
||||
brew cat openblas
|
||||
|
||||
- name: Post logs on failure
|
||||
if: failure()
|
||||
run: brew gist-logs --with-hostname -v openblas
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -87,4 +87,5 @@ build.*
|
||||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
|
||||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
|
||||
19
.travis.yml
19
.travis.yml
@@ -17,7 +17,7 @@ matrix:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
@@ -160,18 +160,25 @@ matrix:
|
||||
os: osx
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
osx_image: xcode10.0
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 8.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 9)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
||||
@@ -171,3 +171,12 @@ In chronological order:
|
||||
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||
* [2019-04-29] power9 sgemm/strmm kernel
|
||||
|
||||
* Jiachen Wang <https://github.com/wjc404>
|
||||
* [2019-07-29] optimize AVX2 DGEMM
|
||||
* [2019-10-20] AVX512 DGEMM kernel (4x8)
|
||||
* [2019-11-06] optimize AVX512 SGEMM
|
||||
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
|
||||
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
||||
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
||||
|
||||
163
Changelog.txt
163
Changelog.txt
@@ -1,45 +1,144 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.9
|
||||
1-Mar-2020
|
||||
|
||||
common:
|
||||
* Fixed a miscompilation of the GETRF functions with CMAKE
|
||||
* Imported bugfix 390 from LAPACK (missing NaN propagation in xCOMBSSQ)
|
||||
* The size of the memory buffer used for splitting GEMM tasks across
|
||||
multiple threads can now be configured in the build system.
|
||||
|
||||
POWER:
|
||||
* Fixed several compilation problems related to endianness
|
||||
and ELF version on POWER8 and POWER9
|
||||
* Fixed use of the absolute value IAMIN/IAMAX instead of IMIN/IMAX
|
||||
* Fixed a race condition in the level3 blas code
|
||||
|
||||
MIPS64:
|
||||
* Fixed use of the absolute value IAMIN/IAMAX instead of IMIN/IMAX
|
||||
|
||||
ARMV7:
|
||||
* Fixed a race condition in the level3 blas code
|
||||
* Fixed compilation on Android
|
||||
ARMV8:
|
||||
* Added support for Ampere EMAG8180
|
||||
* Added support for Neoverse N1
|
||||
* Improved performance of the blas_lock function
|
||||
* Fixed a race condition in the level3 blas code
|
||||
* Fixed a performance regression on TSV110-based servers
|
||||
|
||||
x86_64:
|
||||
* Fixed a long-standing error with undeclared register overwrites
|
||||
in the DSCAL microkernel for HASWELL,SKYLAKEX and ZEN
|
||||
* Fixed a long-standing bug in the SSE implementation of IAMAX
|
||||
* Fixed a CMAKE build failure with DYNAMIC_ARCH
|
||||
* Fixed cpu autodetection of Goldmont+, Cannon Lake and Ice Lake
|
||||
* Fixed a compilation failure on OSX with compiler name containing dash
|
||||
* Fixed compilation with MinGW on SkylakeX
|
||||
* Improved speed of the AVX512 GEMM3M kernel on SkylakeX
|
||||
* Added an AVX512 STRMM kernel for SkylakeX
|
||||
* Improved GEMM performance on Haswell and Zen
|
||||
|
||||
zarch:
|
||||
* fixed compilation of the DYNAMIC_ARCH code
|
||||
|
||||
====================================================================
|
||||
Version 0.3.8
|
||||
9-Feb-2020
|
||||
|
||||
common:
|
||||
* LAPACK has been updated to 3.9.0 (plus patches up to
|
||||
January 2nd, 2020)
|
||||
* CMAKE support has been improved in several areas including
|
||||
cross-compilation
|
||||
* a thread race condition in the GEMM3M kernels was resolved
|
||||
* the "generic" (plain C) gemm beta kernel used by many targets
|
||||
has been sped up
|
||||
* an optimized version of the LAPACK trtrs functions has been added
|
||||
* an incompatibility between the LAPACK tests and the OpenBLAS
|
||||
implementation of XERBLA was resolved, removing the numerous
|
||||
warnings about wrong error exits in the former
|
||||
* support for NetBSD has been added
|
||||
* support for compilation with g95 and non-GNU versions of ld
|
||||
has been improved
|
||||
* support for compilation with (upcoming) gcc 10 has been added
|
||||
|
||||
POWER:
|
||||
* worked around miscompilation of several POWER8 and POWER9
|
||||
kernels by older versions of gcc
|
||||
* added support for big-endian POWER8 and for compilation on AIX
|
||||
* corrected bugs in the big-endian support for PPC440 and PPC970
|
||||
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||
|
||||
ARMV8:
|
||||
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
|
||||
* compilation for 32bit works again
|
||||
* performance of the RPCC function has been improved
|
||||
* improved performance on small systems
|
||||
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||
* cross-compilation from OSX to IOS was simplified
|
||||
|
||||
x86_64:
|
||||
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
|
||||
was significantly improved
|
||||
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
|
||||
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
|
||||
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
|
||||
* added support for QEMU virtual cpus
|
||||
* a compilation problem with PGI and SUN compilers was fixed
|
||||
* Intel "Goldmont plus" is now autodetected
|
||||
* a potential crash on program exit on MS Windows has been fixed
|
||||
|
||||
x86:
|
||||
* an unwanted case sensitivity in the implementation of LSAME
|
||||
on older 32bit AMD cpus was fixed
|
||||
|
||||
zarch:
|
||||
* Z15 is now supported as Z14
|
||||
* DYNAMIC_ARCH is now available on ZARCH as well
|
||||
|
||||
====================================================================
|
||||
Version 0.3.7
|
||||
11-Aug-2019
|
||||
|
||||
common:
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
|
||||
x86_64:
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
|
||||
## POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
|
||||
## ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
|
||||
====================================================================
|
||||
Version 0.3.6
|
||||
|
||||
20
Makefile
20
Makefile
@@ -247,21 +247,21 @@ prof_lapack : lapack_prebuild
|
||||
|
||||
lapack_prebuild :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@@ -319,7 +319,7 @@ lapack-test :
|
||||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||
endif
|
||||
|
||||
lapack-runtest:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -mfpu=neon
|
||||
FCOMMON_OPT += -mfpu=neon
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
|
||||
@@ -24,6 +24,23 @@ CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
@@ -39,7 +56,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ endif
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@@ -81,7 +82,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
@@ -100,6 +101,7 @@ else
|
||||
#install on AIX has different options syntax
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
|
||||
@@ -42,7 +42,7 @@ all: getarch_2nd
|
||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
||||
config.h : c_check f_check getarch
|
||||
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
|
||||
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
|
||||
else
|
||||
@@ -59,13 +59,13 @@ endif
|
||||
|
||||
|
||||
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
|
||||
$(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
||||
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
||||
|
||||
getarch_2nd : getarch_2nd.c config.h dummy
|
||||
ifndef TARGET_CORE
|
||||
$(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c
|
||||
$(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c
|
||||
else
|
||||
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
||||
$(HOSTCC) -I. $(HOST_CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
||||
endif
|
||||
|
||||
dummy:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.8.dev
|
||||
VERSION = 0.3.9
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -97,6 +97,15 @@ VERSION = 0.3.8.dev
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# When multithreading, OpenBLAS needs to use a memory buffer for communicating
|
||||
# and collating results for individual subranges of the original matrix. Since
|
||||
# the original GotoBLAS of the early 2000s, the default size of this buffer has
|
||||
# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC.
|
||||
# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
|
||||
# this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
|
||||
# manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
|
||||
# BUFFERSIZE = 25
|
||||
|
||||
# If you don't need to install the static library, please uncomment this line.
|
||||
# NO_STATIC = 1
|
||||
|
||||
|
||||
@@ -9,9 +9,11 @@ ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture.
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
ifndef ARCH
|
||||
ARCH := $(shell uname -m)
|
||||
HOSTARCH := $(shell uname -m)
|
||||
else
|
||||
HOSTARCH = $(ARCH)
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
@@ -23,6 +25,8 @@ else ifeq ($(ARCH), i386)
|
||||
override ARCH=x86
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
else ifeq ($(ARCH), zarch)
|
||||
override ARCH=zarch
|
||||
endif
|
||||
|
||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
@@ -143,7 +147,7 @@ endif
|
||||
|
||||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(ARCH), x86_64)
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
@@ -210,7 +214,7 @@ ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
||||
# Generating Makefile.conf and config.h
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
|
||||
ifndef TARGET_CORE
|
||||
include $(TOPDIR)/Makefile.conf
|
||||
@@ -320,12 +324,15 @@ CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Test for supporting MS_ABI
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Majar version > 4
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
@@ -544,16 +551,37 @@ endif
|
||||
|
||||
ifeq ($(ARCH), arm64)
|
||||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA53
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += NEOVERSEN1
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
DYNAMIC_CORE = Z13
|
||||
DYNAMIC_CORE += Z14
|
||||
endif
|
||||
|
||||
# POWER targets for DYNAMIC_ARCH builds.
# POWER6 and POWER8 are always listed. POWER9 is added for every non-GCC
# compiler, and for GCC only when GCCVERSIONGT5 is 1 (major version > 5).
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
DYNAMIC_CORE += POWER9
else
# The function name must be followed by whitespace, not a comma: written as
# "$(info, ...)" make sees a reference to an undefined variable and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
endif
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
@@ -697,7 +725,7 @@ endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
ifdef BINARY64
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
else
|
||||
CCOMMON_OPT += -tp p7
|
||||
endif
|
||||
@@ -757,6 +785,9 @@ else
|
||||
FCOMMON_OPT += -m32
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
FCOMMON_OPT += -fno-second-underscore
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -1300,6 +1331,7 @@ export OSNAME
|
||||
export ARCH
|
||||
export CORE
|
||||
export LIBCORE
|
||||
export __BYTE_ORDER__
|
||||
export PGCPATH
|
||||
export CONFIG
|
||||
export CC
|
||||
|
||||
@@ -15,10 +15,12 @@ CCOMMON_OPT += -march=skylake-avx512
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
43
README.md
43
README.md
@@ -26,6 +26,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
||||
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||
Most can also be given directly on the make or cmake command line.
|
||||
|
||||
### Dependencies
|
||||
|
||||
@@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
|
||||
|
||||
## Supported CPUs and Operating Systems
|
||||
|
||||
Please read `GotoBLAS_01Readme.txt`.
|
||||
Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
|
||||
|
||||
### Additional supported CPUs
|
||||
|
||||
@@ -109,8 +111,8 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
@@ -129,8 +131,15 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
#### ARM64
|
||||
|
||||
- **ARMv8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
|
||||
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
- **Falkor**: same as A57 (different cpu specifications)
|
||||
- **ThunderX**: Optimized some Level-1 functions
|
||||
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||
- **TSV110**: Optimized some Level-3 helper functions
|
||||
|
||||
#### PPC/PPC64
|
||||
|
||||
@@ -139,18 +148,34 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
||||
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
|
||||
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows specifying an individual list of targets to include instead of the default.
|
||||
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
|
||||
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
|
||||
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
|
||||
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
|
||||
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
||||
|
||||
### Supported OS
|
||||
|
||||
- **GNU/Linux**
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||
- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
|
||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -205,7 +230,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
* OpenBLAS does not set processor affinity by default.
|
||||
|
||||
@@ -88,6 +88,8 @@ CORTEXA53
|
||||
CORTEXA57
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
NEOVERSEN1
|
||||
EMAG8180
|
||||
FALKOR
|
||||
THUNDERX
|
||||
THUNDERX2T99
|
||||
|
||||
@@ -38,7 +38,8 @@ environment:
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
- COMPILER: MinGW64-gcc-7.2.0
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-5.3.0
|
||||
WITH_FORTRAN: ignore
|
||||
@@ -62,10 +63,10 @@ before_build:
|
||||
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
|
||||
@@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system
|
||||
# ACML 6.1 custom
|
||||
ACML=/home/saar/acml6.1/gfortran64_mp/lib
|
||||
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
|
||||
|
||||
|
||||
# Atlas Ubuntu
|
||||
|
||||
# Atlas Ubuntu
|
||||
#ATLAS=/usr/lib/atlas-base
|
||||
#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm
|
||||
|
||||
@@ -56,6 +56,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
||||
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
|
||||
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
|
||||
ssyr.goto dsyr.goto \
|
||||
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
@@ -83,6 +84,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
sgemm.acml dgemm.acml cgemm.acml zgemm.acml \
|
||||
strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \
|
||||
strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \
|
||||
ssyr.acml dsyr.acml \
|
||||
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
|
||||
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
|
||||
sger.acml dger.acml cger.acml zger.acml \
|
||||
@@ -109,6 +111,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
||||
sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \
|
||||
strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \
|
||||
strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \
|
||||
ssyr.goto dsyr.atlas \
|
||||
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
|
||||
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
|
||||
sger.atlas dger.atlas cger.atlas zger.atlas\
|
||||
@@ -136,6 +139,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||
sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \
|
||||
strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \
|
||||
strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \
|
||||
ssyr.mkl dsyr.mkl \
|
||||
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
|
||||
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
|
||||
sger.mkl dger.mkl cger.mkl zger.mkl \
|
||||
@@ -162,6 +166,7 @@ else
|
||||
goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
||||
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
|
||||
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
|
||||
ssyr.goto dsyr.goto \
|
||||
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
@@ -188,6 +193,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
sgemm.acml dgemm.acml cgemm.acml zgemm.acml \
|
||||
strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \
|
||||
strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \
|
||||
ssyr.acml dsyr.acml \
|
||||
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
|
||||
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
|
||||
sger.acml dger.acml cger.acml zger.acml \
|
||||
@@ -214,6 +220,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
||||
sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \
|
||||
strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \
|
||||
strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \
|
||||
ssyr.atlas dsyr.atlas \
|
||||
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
|
||||
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
|
||||
sger.atlas dger.atlas cger.atlas zger.atlas\
|
||||
@@ -243,6 +250,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||
sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \
|
||||
strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \
|
||||
strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \
|
||||
ssyr.mkl dsyr.mkl \
|
||||
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
|
||||
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
|
||||
sger.mkl dger.mkl cger.mkl zger.mkl \
|
||||
@@ -280,6 +288,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||
sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \
|
||||
strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \
|
||||
strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \
|
||||
ssyr.veclib dsyr.veclib \
|
||||
ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \
|
||||
ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \
|
||||
sger.veclib dger.veclib cger.veclib zger.veclib \
|
||||
@@ -768,6 +777,36 @@ ztrsm.veclib : ztrsm.$(SUFFIX)
|
||||
|
||||
ztrsm.essl : ztrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
##################################### Ssyr ####################################################
|
||||
ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
ssyr.acml : ssyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ssyr.atlas : ssyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ssyr.mkl : ssyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ssyr.veclib : ssyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
##################################### Dsyr ####################################################
|
||||
dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dsyr.acml : dsyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dsyr.atlas : dsyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dsyr.mkl : dsyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dsyr.veclib : dsyr.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ssyrk ####################################################
|
||||
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -2078,6 +2117,12 @@ ctrsm.$(SUFFIX) : trsm.c
|
||||
ztrsm.$(SUFFIX) : trsm.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
ssyr.$(SUFFIX) : syr.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
dsyr.$(SUFFIX) : syr.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
ssyrk.$(SUFFIX) : syrk.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
@@ -129,7 +129,10 @@ int main(int argc, char *argv[]){
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
double time1 = 0.0, timeg = 0.0;
|
||||
long nanos = 0;
|
||||
time_t seconds = 0;
|
||||
struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -163,35 +166,32 @@ int main(int argc, char *argv[]){
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &time_start);
|
||||
COPY (&m, x, &inc_x, y, &inc_y );
|
||||
clock_gettime(CLOCK_REALTIME, &time_end);
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
nanos = time_end.tv_nsec - time_start.tv_nsec;
|
||||
seconds = time_end.tv_sec - time_start.tv_sec;
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
time1 = seconds + nanos / 1.e9;
|
||||
timeg += time1;
|
||||
}
|
||||
|
||||
COPY (&m, x, &inc_x, y, &inc_y );
|
||||
timeg /= loops;
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MBytes %10.6f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
fprintf(stderr,
|
||||
" %10.2f MBytes %12.9f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg / 1.e6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
@@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
187
benchmark/syr.c
Normal file
187
benchmark/syr.c
Normal file
@@ -0,0 +1,187 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef SYR
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define SYR BLASFUNC(dsyr)
|
||||
#else
|
||||
#define SYR BLASFUNC(ssyr)
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocation helper backed by a System V shared-memory segment that is
 * requested with the SHM_HUGETLB flag.
 *
 * size  number of bytes wanted; the request passed to shmget() is
 *       (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 * Returns the address the segment was attached at.
 * Prints a message and calls exit(1) if shmget() or shmat() fails.
 */
static void *huge_malloc(BLASLONG size){
  void *mapped;
  int segment;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
		   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
		   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark the segment for removal now that it is attached. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x,*a;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
char *p;
|
||||
|
||||
char uplo='U';
|
||||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint inc_x= 1;
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x);
|
||||
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
SYR (&uplo, &m, alpha, x, &inc_x, a, &m );
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
27
c_check
27
c_check
@@ -18,11 +18,12 @@ $binary = $ENV{"BINARY"};
|
||||
$makefile = shift(@ARGV);
|
||||
$config = shift(@ARGV);
|
||||
|
||||
$compiler_name = join(" ", @ARGV);
|
||||
$compiler_name = shift(@ARGV);
|
||||
$flags = join(" ", @ARGV);
|
||||
|
||||
# First, we need to know the target OS and compiler name
|
||||
|
||||
$data = `$compiler_name -E ctest.c`;
|
||||
$data = `$compiler_name $flags -E ctest.c`;
|
||||
|
||||
if ($?) {
|
||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||
@@ -175,7 +176,7 @@ if ($defined == 0) {
|
||||
|
||||
# Do again
|
||||
|
||||
$data = `$compiler_name -E ctest.c`;
|
||||
$data = `$compiler_name $flags -E ctest.c`;
|
||||
|
||||
if ($?) {
|
||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||
@@ -188,14 +189,14 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
$args = "$msa_flags -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
@@ -229,11 +230,14 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
|
||||
if ($compiler eq "PGI") {
|
||||
$args = " -tp skylake -c -o $tmpf.o $tmpf";
|
||||
}
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
@@ -244,7 +248,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
}
|
||||
}
|
||||
|
||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
||||
$data =~ /globl\s([_\.]*)(.*)/;
|
||||
|
||||
@@ -267,7 +271,7 @@ $linker_l = "";
|
||||
$linker_a = "";
|
||||
|
||||
{
|
||||
$link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
|
||||
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
@@ -305,6 +309,7 @@ $linker_a = "";
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
) {
|
||||
$linker_l .= $flags . " "
|
||||
}
|
||||
|
||||
@@ -45,7 +45,11 @@ endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1)
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
||||
endif ()
|
||||
|
||||
if (X86)
|
||||
@@ -73,7 +77,7 @@ if (DYNAMIC_ARCH)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets C related variables.
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
||||
else ()
|
||||
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
else ()
|
||||
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
|
||||
if (MIPS64)
|
||||
|
||||
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
||||
if (X86)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
||||
@@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL "SKYLAKEX")
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
# Append the AVX512 target flag; set() takes no "=" (it would be stored as a list element).
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
@@ -115,7 +115,9 @@ set(SLASRC
|
||||
stplqt.f stplqt2.f stpmlqt.f
|
||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
@@ -210,7 +212,9 @@ set(CLASRC
|
||||
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cunhr_col.f )
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
@@ -299,7 +303,9 @@ set(DLASRC
|
||||
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
@@ -398,7 +404,9 @@ set(ZLASRC
|
||||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zunhr_col.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
||||
@@ -715,6 +715,8 @@ set(DSRC
|
||||
lapacke_dgesv_work.c
|
||||
lapacke_dgesvd.c
|
||||
lapacke_dgesvd_work.c
|
||||
lapacke_dgesvdq.c
|
||||
lapacke_dgesvdq_work.c
|
||||
lapacke_dgesvdx.c
|
||||
lapacke_dgesvdx_work.c
|
||||
lapacke_dgesvj.c
|
||||
@@ -1287,6 +1289,8 @@ set(SSRC
|
||||
lapacke_sgesv_work.c
|
||||
lapacke_sgesvd.c
|
||||
lapacke_sgesvd_work.c
|
||||
lapacke_sgesvdq.c
|
||||
lapacke_sgesvdq_work.c
|
||||
lapacke_sgesvdx.c
|
||||
lapacke_sgesvdx_work.c
|
||||
lapacke_sgesvj.c
|
||||
@@ -1853,6 +1857,8 @@ set(ZSRC
|
||||
lapacke_zgesv_work.c
|
||||
lapacke_zgesvd.c
|
||||
lapacke_zgesvd_work.c
|
||||
lapacke_zgesvdq.c
|
||||
lapacke_zgesvdq_work.c
|
||||
lapacke_zgesvdx.c
|
||||
lapacke_zgesvdx_work.c
|
||||
lapacke_zgesvj.c
|
||||
|
||||
@@ -105,8 +105,39 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
# Perhaps this should be inside a different file as it grows larger
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ${TCORE}\n"
|
||||
"#define CORE_${TCORE}\n"
|
||||
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||
if ("${TCORE}" STREQUAL "ARMV7")
|
||||
if ("${TCORE}" STREQUAL "CORE2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t256\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
@@ -121,6 +152,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
@@ -194,6 +229,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "NEOVERSEN1")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
@@ -274,6 +336,106 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "EMAG8180")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t5262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER9")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
endif()
|
||||
|
||||
# Or should this actually be NUM_CORES?
|
||||
@@ -309,6 +471,9 @@ else(NOT CMAKE_CROSSCOMPILING)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
else()
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
if (DEFINED TARGET_CORE)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
|
||||
@@ -289,6 +289,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||
|
||||
if (BUFFERSIZE)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}")
|
||||
endif ()
|
||||
|
||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
|
||||
@@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
else()
|
||||
set(X86 1)
|
||||
if (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
@@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||
else()
|
||||
set(ARM 1)
|
||||
endif()
|
||||
elseif (${CMAKE_CROSSCOMPILING})
|
||||
if (${TARGET} STREQUAL "CORE2")
|
||||
if (NOT BINARY)
|
||||
set(X86 1)
|
||||
elseif (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
set(ARM64 1)
|
||||
endif ()
|
||||
else ()
|
||||
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
|
||||
endif()
|
||||
|
||||
if (X86_64)
|
||||
@@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
file(REMOVE "avx512.tmp" "avx512.o")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -53,16 +53,16 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
BLASULONG ret;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"mov x4, #1 \n\t"
|
||||
"sevl \n\t"
|
||||
"1: \n\t"
|
||||
"wfe \n\t"
|
||||
"2: \n\t"
|
||||
"ldaxr x2, [%1] \n\t"
|
||||
"cbnz x2, 1b \n\t"
|
||||
"2: \n\t"
|
||||
"stxr w3, x4, [%1] \n\t"
|
||||
"cbnz w3, 1b \n\t"
|
||||
"cbnz w3, 2b \n\t"
|
||||
"mov %0, #0 \n\t"
|
||||
: "=r"(ret), "=r"(address)
|
||||
: "1"(address)
|
||||
@@ -78,7 +78,20 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
/*
 * rpcc - read a counter value from the AArch64 system registers.
 *
 * Issues an isb and reads cntvct_el0 into `ret`, then reads cntfrq_el0 and
 * replaces it with the count of leading zero bits of its low 32 bits
 * (clz on the %w view of the operand). The counter is returned left-shifted
 * by that count.
 *
 * Takes no arguments; returns the shifted counter as a BLASULONG.
 */
static __inline BLASULONG rpcc(void){
|
||||
BLASULONG ret = 0;
|
||||
blasint shift;
|
||||
|
||||
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret)); /* isb first, then read the counter */
|
||||
__asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift)); /* shift = clz(low 32 bits of cntfrq_el0) */
|
||||
|
||||
return ret << shift;
|
||||
}
|
||||
|
||||
#define RPCC_DEFINED
|
||||
#define RPCC64BIT
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
@@ -103,12 +116,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
.macro PROLOGUE
|
||||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
.endm
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
|
||||
146
common_lapack.h
146
common_lapack.h
@@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
|
||||
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
#endif
|
||||
|
||||
165
common_macro.h
165
common_macro.h
@@ -641,7 +641,7 @@
|
||||
#define IMATCOPY_K_CT DIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K DGEADD_K
|
||||
#define GEADD_K DGEADD_K
|
||||
#else
|
||||
|
||||
#define AMAX_K SAMAX_K
|
||||
@@ -944,7 +944,7 @@
|
||||
#define IMATCOPY_K_CT SIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
#define GEADD_K SGEADD_K
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
@@ -1770,7 +1770,7 @@
|
||||
#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K ZGEADD_K
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#else
|
||||
|
||||
@@ -2193,7 +2193,7 @@
|
||||
#define IMATCOPY_K_CTC CIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC CIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K CGEADD_K
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -2806,3 +2806,160 @@ typedef struct {
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
|
||||
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE strtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE strtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE strtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE strtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE strtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE strtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE strtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE strtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE xtrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE xtrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ztrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ztrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ctrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ctrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -39,6 +39,35 @@
|
||||
#ifndef COMMON_POWER
|
||||
#define COMMON_POWER
|
||||
|
||||
/* Turn a macro argument into a string literal. */
#define str(x) #x

/*
 * Doubleword splat / merge / swap / move helpers.
 *
 * The plain macros expand to bare instruction text; the *_S macros expand to
 * the same instruction as a string literal that ends in " \n\t".
 *
 * When OS_AIX is defined the instructions are spelled with xxpermdi and
 * xvcpsgndp and explicit operands (presumably because the extended mnemonics
 * are not accepted there -- confirm); otherwise the xxspltd / xxmrghd /
 * xxmrgld / xxswapd / xvmovdp mnemonics are used directly.
 */
#ifndef OS_AIX

#define XXSPLTD(T,A,z) xxspltd T, A, z
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"

#define XXMRGHD(T,A,B) xxmrghd T, A, B
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"

#define XXMRGLD(T,A,B) xxmrgld T, A, B
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"

#define XXSWAPD(T,A) xxswapd T, A
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"

#define XVMOVDP(T,A) xvmovdp T, A

#else

/* z is pasted twice to build the two-bit selector (0b00 or 0b11). */
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"

#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"

#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"

#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"

#define XVMOVDP(T,A) xvcpsgndp T, A, A

#endif
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
@@ -241,7 +270,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
||||
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *c, BLASLONG ldc, int (*fuction)());
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
|
||||
@@ -225,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
/* BUFFER_SIZE defaults to 32 << 20 (33554432); when BUFFERSIZE is defined it
   replaces the shift count, so BUFFERSIZE=20 reproduces the default. */
/* NOTE(review): 32 << BUFFERSIZE is computed in int, so a shift count of 26 or
   more overflows where int is 32 bits -- confirm the accepted range. */
#ifndef BUFFERSIZE
|
||||
#define BUFFER_SIZE (32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
#define CPU_NEOVERSEN1 11
|
||||
// Qualcomm
|
||||
#define CPU_FALKOR 6
|
||||
// Cavium
|
||||
@@ -41,6 +42,8 @@
|
||||
#define CPU_THUNDERX2T99 8
|
||||
//Hisilicon
|
||||
#define CPU_TSV110 9
|
||||
// Ampere
|
||||
#define CPU_EMAG8180 10
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
@@ -52,7 +55,9 @@ static char *cpuname[] = {
|
||||
"FALKOR",
|
||||
"THUNDERX",
|
||||
"THUNDERX2T99",
|
||||
"TSV110"
|
||||
"TSV110",
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
@@ -65,7 +70,9 @@ static char *cpuname_lower[] = {
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"tsv110"
|
||||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
@@ -140,6 +147,8 @@ int detect(void)
|
||||
return CPU_CORTEXA72;
|
||||
else if (strstr(cpu_part, "0xd09"))
|
||||
return CPU_CORTEXA73;
|
||||
else if (strstr(cpu_part, "0xd0c"))
|
||||
return CPU_NEOVERSEN1;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
@@ -152,6 +161,9 @@ int detect(void)
|
||||
// HiSilicon
|
||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||
return CPU_TSV110;
|
||||
// Ampere
|
||||
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
|
||||
return CPU_EMAG8180;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
@@ -206,6 +218,33 @@ void get_subdirname(void)
|
||||
printf("arm64");
|
||||
}
|
||||
|
||||
/*
 * Print "#define NUM_CORES <n>" to stdout, where <n> is the number of lines
 * read from /proc/cpuinfo that begin with "processor".
 *
 * The body is compiled only when the `linux` macro is defined; otherwise the
 * function does nothing.  If /proc/cpuinfo cannot be opened, nothing is
 * printed (previously the NULL stream was handed to fgets() and fclose()).
 */
void get_cpucount(void)
{
#ifdef linux
	FILE *infile;
	char buffer[2048];
	int n = 0;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL)
		return;

	while (fgets(buffer, sizeof(buffer), infile)) {
		/* Each logical CPU contributes one line starting with "processor". */
		if (!strncmp("processor", buffer, 9))
			n++;
	}

	fclose(infile);

	printf("#define NUM_CORES %d\n", n);
#endif
}
|
||||
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
@@ -251,6 +290,20 @@ void get_cpuconfig(void)
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_NEOVERSEN1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
@@ -308,7 +361,20 @@ void get_cpuconfig(void)
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_EMAG8180:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define EMAG8180\n");
|
||||
printf("#define L1_CODE_SIZE 32768\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
||||
|
||||
@@ -351,5 +417,3 @@ void get_features(void)
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
52
cpuid_x86.c
52
cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
|
||||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_CORE2;
|
||||
#else
|
||||
return CPUTYPE_PENTIUM2;
|
||||
#endif
|
||||
case 7:
|
||||
case 8:
|
||||
case 10:
|
||||
@@ -1379,8 +1383,8 @@ int get_cpuname(void){
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
switch (model) {
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Ice Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
@@ -1427,7 +1431,11 @@ int get_cpuname(void){
|
||||
case 0x5:
|
||||
return CPUTYPE_AMDK6;
|
||||
case 0x6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_BARCELONA;
|
||||
#else
|
||||
return CPUTYPE_ATHLON;
|
||||
#endif
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 0:
|
||||
@@ -1810,7 +1818,11 @@ int get_coretype(void){
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CORE_CORE2;
|
||||
#else
|
||||
return CORE_P6;
|
||||
#endif
|
||||
case 7:
|
||||
return CORE_KATMAI;
|
||||
case 8:
|
||||
@@ -1994,6 +2006,38 @@ int get_coretype(void){
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
if (model == 6)
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 7:
|
||||
if (model == 10)
|
||||
return CORE_NEHALEM;
|
||||
if (model == 14)
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14) { // Kaby Lake
|
||||
@@ -2017,7 +2061,11 @@ int get_coretype(void){
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (family <= 0x5) return CORE_80486;
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
if (family <= 0xe) return CORE_BARCELONA;
|
||||
#else
|
||||
if (family <= 0xe) return CORE_ATHLON;
|
||||
#endif
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
|
||||
@@ -30,17 +30,20 @@
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
/*
 * Upper-case CPU names, one entry per CPU_* constant and in the same order
 * (CPU_GENERIC = 0, CPU_Z13 = 1, CPU_Z14 = 2, CPU_Z15 = 3).
 * Every entry except the last must end with a comma: two adjacent string
 * literals would silently merge into a single element.
 */
static char *cpuname[] = {
  "ZARCH_GENERIC",
  "Z13",
  "Z14",
  "Z15"
};
|
||||
|
||||
/*
 * Lower-case CPU names, one entry per CPU_* constant and in the same order
 * (CPU_GENERIC = 0, CPU_Z13 = 1, CPU_Z14 = 2, CPU_Z15 = 3).
 * Every entry except the last must end with a comma: two adjacent string
 * literals would silently merge into a single element.
 */
static char *cpuname_lower[] = {
  "zarch_generic",
  "z13",
  "z14",
  "z15"
};
|
||||
|
||||
int detect(void)
|
||||
@@ -66,6 +69,8 @@ int detect(void)
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
@@ -1503,6 +1503,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -1504,6 +1504,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
6 NUMBER OF VALUES OF N
|
||||
7 NUMBER OF VALUES OF N
|
||||
1 2 3 5 7 9 35 VALUES OF N
|
||||
3 NUMBER OF VALUES OF ALPHA
|
||||
0.0 1.0 0.7 VALUES OF ALPHA
|
||||
|
||||
@@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
6 NUMBER OF VALUES OF N
|
||||
7 NUMBER OF VALUES OF N
|
||||
0 1 2 3 5 9 35 VALUES OF N
|
||||
3 NUMBER OF VALUES OF ALPHA
|
||||
0.0 1.0 0.7 VALUES OF ALPHA
|
||||
|
||||
@@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
||||
@@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
#else
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -104,7 +104,7 @@ typedef struct {
|
||||
/*
 * Call GEMM_BETA on the part of C that starts at element offset
 * M_FROM + N_FROM * LDC and spans (M_TO - M_FROM) by (N_TO - N_FROM).
 *
 * The whole element offset is multiplied by COMPSIZE (presumably the number
 * of FLOAT values per matrix element -- confirm); scaling only the
 * N_FROM * LDC term would misplace the pointer whenever M_FROM is non-zero
 * and COMPSIZE is not 1.  All arguments are parenthesized so that expression
 * arguments expand safely.
 */
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
	GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
		  (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
		  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
|
||||
#endif
|
||||
|
||||
#ifndef ICOPYB_OPERATION
|
||||
@@ -408,13 +408,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
}
|
||||
WMB;
|
||||
}
|
||||
|
||||
current = mypos;
|
||||
|
||||
@@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
@@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
@@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,13 +544,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
@@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
@@ -677,13 +681,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
@@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
@@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
}
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
}
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
||||
@@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
#endif
|
||||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
@@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
newarg.m = args -> m;
|
||||
newarg.n = args -> n;
|
||||
newarg.k = args -> k;
|
||||
@@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
STOP_RPCC(waiting1);
|
||||
MB;
|
||||
|
||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||
|
||||
@@ -365,12 +366,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
/* Split local region of B into parts */
|
||||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, js + div_n) - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
/* Copy part of local region of B into workspace */
|
||||
START_RPCC();
|
||||
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
||||
@@ -391,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
}
|
||||
#endif
|
||||
|
||||
WMB;
|
||||
/* Set flag so other threads can access local region of B */
|
||||
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
WMB;
|
||||
}
|
||||
|
||||
/* Get regions of B from other threads and apply kernel */
|
||||
@@ -413,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Wait until other region of B is initialized */
|
||||
START_RPCC();
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
STOP_RPCC(waiting2);
|
||||
MB;
|
||||
|
||||
/* Apply kernel with local region of A and part of other region of B */
|
||||
START_RPCC();
|
||||
@@ -430,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Clear synchronization flag if this thread is done with other region of B */
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
@@ -473,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Clear synchronization flag if this thread is done with region of B */
|
||||
if (is + min_i >= m_to) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -493,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
||||
}
|
||||
}
|
||||
STOP_RPCC(waiting3);
|
||||
MB;
|
||||
|
||||
#ifdef TIMING
|
||||
BLASLONG waiting = waiting1 + waiting2 + waiting3;
|
||||
@@ -701,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
WMB;
|
||||
/* Execute parallel computation */
|
||||
exec_blas(nthreads, queue);
|
||||
}
|
||||
|
||||
@@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
@@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
@@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||
@@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||
|
||||
@@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||
min_jj = ls - js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
|
||||
#else
|
||||
@@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
|
||||
#else
|
||||
@@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
#else
|
||||
@@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
|
||||
#else
|
||||
@@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||
min_jj = js - ls - min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
|
||||
sb + min_l * (min_l + jjs) * COMPSIZE);
|
||||
@@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
#else
|
||||
|
||||
@@ -21,9 +21,13 @@ else
|
||||
ifeq ($(ARCH),power)
|
||||
COMMONOBJS += dynamic_power.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),zarch)
|
||||
COMMONOBJS += dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
@@ -85,9 +89,13 @@ else
|
||||
ifeq ($(ARCH),power)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),zarch)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
||||
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -329,7 +329,7 @@ int support_avx512(){
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
@@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont Plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
|
||||
@@ -37,17 +37,26 @@
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
#define NUM_CORETYPES 11
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
@@ -63,17 +72,31 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
static char *corename[] = {
|
||||
"armv8",
|
||||
"cortexa53",
|
||||
"cortexa57",
|
||||
"cortexa72",
|
||||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
@@ -94,9 +117,16 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_ARMV8);
|
||||
case 1: return (&gotoblas_CORTEXA57);
|
||||
case 2: return (&gotoblas_THUNDERX);
|
||||
case 3: return (&gotoblas_THUNDERX2T99);
|
||||
case 1: return (&gotoblas_CORTEXA53);
|
||||
case 2: return (&gotoblas_CORTEXA57);
|
||||
case 3: return (&gotoblas_CORTEXA72);
|
||||
case 4: return (&gotoblas_CORTEXA73);
|
||||
case 5: return (&gotoblas_FALKOR);
|
||||
case 6: return (&gotoblas_THUNDERX);
|
||||
case 7: return (&gotoblas_THUNDERX2T99);
|
||||
case 8: return (&gotoblas_TSV110);
|
||||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
@@ -105,13 +135,17 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
@@ -130,10 +164,16 @@ static gotoblas_t *get_coretype(void) {
|
||||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
case 0xd07: // Cortex A57
|
||||
case 0xd08: // Cortex A72
|
||||
case 0xd03: // Cortex A53
|
||||
return &gotoblas_CORTEXA53;
|
||||
case 0xd07: // Cortex A57
|
||||
return &gotoblas_CORTEXA57;
|
||||
case 0xd08: // Cortex A72
|
||||
return &gotoblas_CORTEXA72;
|
||||
case 0xd09: // Cortex A73
|
||||
return &gotoblas_CORTEXA73;
|
||||
case 0xd0c: // Neoverse N1
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
@@ -152,6 +192,27 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
case 0x48: // HiSilicon
|
||||
switch (part)
|
||||
{
|
||||
case 0xd01: // tsv110
|
||||
return &gotoblas_TSV110;
|
||||
}
|
||||
break;
|
||||
case 0x50: // Ampere
|
||||
switch (part)
|
||||
{
|
||||
case 0x000: // Skylark/EMAG8180
|
||||
return &gotoblas_EMAG8180;
|
||||
}
|
||||
break;
|
||||
case 0x51: // Qualcomm
|
||||
switch (part)
|
||||
{
|
||||
case 0xc00: // Falkor
|
||||
return &gotoblas_FALKOR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
@@ -19,7 +21,9 @@ static char *corename[] = {
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
@@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
||||
131
driver/others/dynamic_zarch.c
Normal file
131
driver/others/dynamic_zarch.c
Normal file
@@ -0,0 +1,131 @@
|
||||
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_Z13;
|
||||
extern gotoblas_t gotoblas_Z14;
|
||||
//extern gotoblas_t gotoblas_Z15;
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
//extern gotoblas_t gotoblas_Z14;
|
||||
//#endif
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
extern void openblas_warning(int verbose, const char* msg);
|
||||
|
||||
static char* corename[] = {
|
||||
"unknown",
|
||||
"Z13",
|
||||
"Z14",
|
||||
// "Z15",
|
||||
"ZARCH_GENERIC",
|
||||
};
|
||||
|
||||
char* gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||
// if (gotoblas == &gotoblas_Z15) return corename[3];
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
//#endif
|
||||
return corename[0]; // try generic?
|
||||
}
|
||||
|
||||
// __builtin_cpu_is is not supported by zarch
|
||||
static gotoblas_t* get_coretype(void) {
|
||||
FILE* infile;
|
||||
char buffer[512], * p;
|
||||
|
||||
p = (char*)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
||||
if (!strncmp("Type", buffer, 4)) {
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return &gotoblas_Z13;
|
||||
if (strstr(p, "2965")) return &gotoblas_Z13;
|
||||
if (strstr(p, "3906")) return &gotoblas_Z14;
|
||||
if (strstr(p, "3907")) return &gotoblas_Z14;
|
||||
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
|
||||
return NULL; // should be ZARCH_GENERIC
|
||||
}
|
||||
|
||||
static gotoblas_t* force_coretype(char* coretype) {
|
||||
|
||||
int i;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for (i = 0; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 1: return (&gotoblas_Z13);
|
||||
case 2: return (&gotoblas_Z14);
|
||||
// case 3: return (&gotoblas_Z15);
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// case 3: return (&gotoblas_POWER9);
|
||||
//#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char* p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if (p)
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to Z14 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_Z14;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas->init();
|
||||
}
|
||||
else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -312,7 +312,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -822,7 +822,7 @@ static void *alloc_qalloc(void *address){
|
||||
|
||||
static void alloc_windows_free(struct alloc_t *alloc_info){
|
||||
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, 0, MEM_RELEASE);
|
||||
|
||||
}
|
||||
|
||||
@@ -935,7 +935,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1673,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -1736,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -1855,7 +1855,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -1945,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1953,7 +1953,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -1977,7 +1977,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -2310,7 +2310,7 @@ static void *alloc_qalloc(void *address){
|
||||
|
||||
static void alloc_windows_free(struct release_t *release){
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
|
||||
VirtualFree(release -> address, 0, MEM_RELEASE);
|
||||
|
||||
}
|
||||
|
||||
@@ -2432,7 +2432,7 @@ static void alloc_hugetlb_free(struct release_t *release){
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -38,21 +38,29 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Dummy functions for builds without SMP: the processor count is always 1
   and requests to change the thread count are ignored. */
int goto_get_num_procs(void) { return 1; }

void goto_set_num_threads(int num_threads) { (void)num_threads; }
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
/*
 * Processor count for Linux/SunOS builds compiled with NO_AFFINITY.
 *
 * The configured processor count is queried once through sysconf() and
 * cached.  Non-Linux targets return that cached value.  On Linux the
 * affinity-aware counting is disabled in this file and the function
 * reports a single processor.
 * NOTE(review): confirm that always returning 1 on Linux is intended.
 */
int get_num_procs(void) {
  static int nums = 0;

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

#if !defined(OS_LINUX)
  return nums;
#else
  return 1;
#endif
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
/* Android: configured processor count, queried once via sysconf() and
   cached for subsequent calls. */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
/* Haiku: configured processor count, queried once via sysconf() and
   cached for subsequent calls. */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/* AIX: configured processor count, queried once via sysconf() and
   cached for subsequent calls. */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
/* BSD family: processor count read through sysctl({CTL_HW, HW_NCPU}).
   The result is cached in a static; while it is still 0 (including after a
   failed sysctl, whose return value is not inspected) the query is repeated
   on the next call. */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    int mib[2];
    size_t out_len = sizeof(int);

    mib[0] = CTL_HW;
    mib[1] = HW_NCPU;
    sysctl(mib, 2, &cached_count, &out_len, NULL, 0);
  }

  return cached_count;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
/* Darwin: core count read from the "hw.physicalcpu" sysctl.
   The value is cached in a static; the sysctlbyname() return code is not
   checked, so a failed query leaves the cache at 0 and is retried on the
   next call. */
int get_num_procs(void) {
  static int cached_count = 0;
  size_t out_len;

  if (cached_count != 0) {
    return cached_count;
  }

  out_len = sizeof(int);
  sysctlbyname("hw.physicalcpu", &cached_count, &out_len, NULL, 0);
  return cached_count;
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the number of CPU cores for multithreading.
|
||||
It can be set by openblas_set_num_threads(int num_threads);
|
||||
*/
|
||||
int blas_cpu_number = 0;
|
||||
/*
|
||||
The numbers of threads in the thread pool.
|
||||
This value is equal to or larger than blas_cpu_number; any threads beyond blas_cpu_number are asleep.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
||||
/*
 * Registers blas_thread_shutdown as the pthread_atfork() "prepare" hook so
 * the OpenBLAS-managed pthread pool is shut down before a fork(). This only
 * applies to builds made with "make USE_OPENMP=0".
 *
 * Hangs can still occur when OpenBLAS is built against the libgomp
 * implementation of OpenMP; that problem is tracked at
 *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
 * Until it is resolved, build with USE_OPENMP=0 or link against another
 * OpenMP implementation.
 *
 * The registration is compiled out on native Windows, on Android, and when
 * SMP_SERVER is not defined; in those builds this function does nothing.
 */
void openblas_fork_handler()
{
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL) != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Public query for the processor count: forwards to get_num_procs() in SMP
   builds and is the constant 1 in non-SMP builds. */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
|
||||
|
||||
/* Public query for the number of threads in use. Non-SMP builds always
   answer 1. SMP builds first call blas_get_cpu_number(), which initialises
   blas_cpu_number if that has not happened yet, and then return it. */
int openblas_get_num_threads(void) {
#ifdef SMP
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
|
||||
|
||||
@@ -78,10 +78,10 @@ char tmpstr[20];
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
897
dynamic.c
897
dynamic.c
@@ -1,897 +0,0 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strncasecmp _strnicmp
|
||||
#define strcasecmp _stricmp
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#define EXTERN extern
|
||||
#else
|
||||
#define EXTERN
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_LIST
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
|
||||
#ifdef DYN_ATHLON
|
||||
extern gotoblas_t gotoblas_ATHLON;
|
||||
#else
|
||||
#define gotoblas_ATHLON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_KATMAI
|
||||
extern gotoblas_t gotoblas_KATMAI;
|
||||
#else
|
||||
#define gotoblas_KATMAI gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BANIAS
|
||||
extern gotoblas_t gotoblas_BANIAS;
|
||||
#else
|
||||
#define gotoblas_BANIAS gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_COPPERMINE
|
||||
extern gotoblas_t gotoblas_COPPERMINE;
|
||||
#else
|
||||
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NORTHWOOD
|
||||
extern gotoblas_t gotoblas_NORTHWOOD;
|
||||
#else
|
||||
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_CORE2
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
#else
|
||||
#define gotoblas_CORE2 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NEHALEM
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
#else
|
||||
#define gotoblas_NEHALEM gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BARCELONA
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BARCELONA gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BARCELONA gotoblas_PRESCOTT
|
||||
#endif
|
||||
/* ATOM kernels: use the dedicated table when it is part of the DYNAMIC_LIST
   build, otherwise alias to NEHALEM if that is built, otherwise to PRESCOTT.
   (The middle branch must be a real "#elif" directive; without the '#' it
   was plain text, so the NEHALEM alias was never selected and a DYN_ATOM
   build saw a stray token.) */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
|
||||
#ifdef DYN_NANO
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
#else
|
||||
#define gotoblas_NANO gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PENRYN
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
#else
|
||||
#define gotoblas_PENRYN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_DUNNINGTON
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
#else
|
||||
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
#else
|
||||
#define gotoblas_OPTERON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON_SSE3
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
#else
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BOBCAT
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BOBCAT gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BOBCAT gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SANDYBRIDGE
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BULLDOZER
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BULLDOZER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PILEDRIVER
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_STEAMROLLER
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_EXCAVATOR
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_HASWELL
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_HASWELL gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_ZEN
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_ZEN gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_ZEN gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_ZEN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SKYLAKEX
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
|
||||
#endif
|
||||
|
||||
|
||||
#else // not DYNAMIC_LIST
|
||||
EXTERN gotoblas_t gotoblas_KATMAI;
|
||||
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
||||
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
||||
EXTERN gotoblas_t gotoblas_BANIAS;
|
||||
EXTERN gotoblas_t gotoblas_ATHLON;
|
||||
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#ifdef DYNAMIC_OLDER
|
||||
extern gotoblas_t gotoblas_ATOM;
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#else
|
||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||
#define gotoblas_NANO gotoblas_NEHALEM
|
||||
#define gotoblas_PENRYN gotoblas_CORE2
|
||||
#define gotoblas_DUNNINGTON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
|
||||
#define gotoblas_BOBCAT gotoblas_CORE2
|
||||
#endif
|
||||
|
||||
#ifndef NO_AVX
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#ifndef NO_AVX512
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
|
||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
#endif // DYNAMIC_LIST
|
||||
|
||||
#define VENDOR_INTEL 1
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
||||
#ifndef NO_AVX
|
||||
/* Executes XGETBV with ECX = op and stores the resulting EAX and EDX into
   *eax and *edx. The instruction is emitted as its raw opcode bytes
   (0f 01 d0) instead of by mnemonic. */
static inline void xgetbv(int op, int * eax, int * edx){
  __asm__ __volatile__ (
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (*eax), "=d" (*edx)
    : "c" (op)
    : "cc"
  );
}
|
||||
#endif
|
||||
|
||||
/* Returns 1 when AVX can be used, 0 otherwise (always 0 in NO_AVX builds).
   Two conditions must hold:
     - CPUID leaf 1 reports ECX bits 26, 27 and 28 all set, and
     - XGETBV(0) reports bits 1 and 2 set (mask 6), i.e. the OS has enabled
       the corresponding register state. */
int support_avx(){
#ifdef NO_AVX
  return 0;
#else
  const int required_ecx = (1 << 28) | (1 << 27) | (1 << 26);
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);
  if ((ecx & required_ecx) != required_ecx) {
    return 0;
  }

  xgetbv(0, &eax, &edx);
  if ((eax & 6) != 6) {
    return 0;
  }
  return 1;  //OS support AVX
#endif
}
|
||||
|
||||
/* Returns 1 when AVX2 can be used, 0 otherwise (always 0 in NO_AVX2 builds).
   AVX2 needs everything support_avx() checks (including OS-enabled YMM
   state) plus the AVX2 feature flag, which is CPUID.(EAX=7,ECX=0):EBX
   bit 5. The previous code tested bit 7, which is the SMEP flag, so
   detection depended on an unrelated feature being exposed. */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;  /* ecx=0: leaf 7 is queried with sub-leaf 0 */

  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);
  if ((ebx & (1 << 5)) == 0)
    return 0;  /* CPU does not report AVX2 */

  return 1;
#else
  return 0;
#endif
}
|
||||
|
||||
/* Returns 1 when the AVX512 kernels can be used, 0 otherwise (always 0 when
   NO_AVX or NO_AVX512 is defined). Requirements, in order:
     - support_avx() succeeds,
     - CPUID.(EAX=7,ECX=0):EBX bit 5 (AVX2) is set,
     - CPUID.(EAX=7,ECX=0):EBX bit 31 (AVX512VL) is set,
     - XGETBV(0) reports bits 5, 6 and 7 set (mask 0xe0), i.e. the OS has
       enabled the opmask and ZMM register state.
   Fixes relative to the earlier version: the AVX2 pre-check compared a
   masked value against 1 (never equal) and only assigned ret=0, so it had no
   effect; it also looked at bit 7 rather than the AVX2 bit; and bit 31 was
   formed with a signed 1<<31 shift. */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx=0, edx;  /* ecx=0 to match support_avx2(): sub-leaf 0 */
  unsigned int features;

  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);
  features = (unsigned int)ebx;

  if ((features & (1u << 5)) == 0)
    return 0;  /* not even AVX2 */
  if ((features & (1u << 31)) == 0)
    return 0;  /* no AVX512VL */

  xgetbv(0, &eax, &edx);
  if ((eax & 0xe0) != 0xe0)
    return 0;  /* OS has not enabled AVX512 register state */

  return 1;  //OS supports AVX512VL
#else
  return 0;
#endif
}
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
|
||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
|
||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
static int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
|
||||
union
|
||||
{
|
||||
char vchar[16];
|
||||
int vint[4];
|
||||
} vendor;
|
||||
|
||||
cpuid(0, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
*(&vendor.vint[0]) = ebx;
|
||||
*(&vendor.vint[1]) = edx;
|
||||
*(&vendor.vint[2]) = ecx;
|
||||
|
||||
vendor.vchar[12] = '\0';
|
||||
|
||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
return VENDOR_UNKNOWN;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void){
|
||||
|
||||
int eax, ebx, ecx, edx;
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
family = BITMASK(eax, 8, 0x0f);
|
||||
exfamily = BITMASK(eax, 20, 0xff);
|
||||
model = BITMASK(eax, 4, 0x0f);
|
||||
exmodel = BITMASK(eax, 16, 0x0f);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
if (vendor == VENDOR_INTEL){
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
switch (exmodel) {
|
||||
case 0:
|
||||
if (model <= 0x7) return &gotoblas_KATMAI;
|
||||
if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
|
||||
if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
|
||||
if (model == 14) return &gotoblas_BANIAS;
|
||||
if (model == 15) return &gotoblas_CORE2;
|
||||
return NULL;
|
||||
|
||||
case 1:
|
||||
if (model == 6) return &gotoblas_CORE2;
|
||||
if (model == 7) return &gotoblas_PENRYN;
|
||||
if (model == 13) return &gotoblas_DUNNINGTON;
|
||||
if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
|
||||
if (model == 12) return &gotoblas_ATOM;
|
||||
return NULL;
|
||||
|
||||
case 2:
|
||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
|
||||
// Xeon (Clarkdale), 32nm
|
||||
if (model == 5) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Xeon Processor 5600 (Westmere-EP)
|
||||
//Xeon Processor E7 (Westmere-EX)
|
||||
//Xeon E7540
|
||||
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||
//Intel Core i7-3000 / Xeon E5
|
||||
if (model == 10 || model == 13) {
|
||||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 3:
|
||||
//Intel Sandy Bridge 22nm (Ivy Bridge?)
|
||||
if (model == 10 || model == 14) {
|
||||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Haswell
|
||||
if (model == 12 || model == 15) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 13) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 7) return &gotoblas_ATOM; //Bay Trail
|
||||
return NULL;
|
||||
case 4:
|
||||
//Intel Haswell
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 7 || model == 15) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Braswell / Avoton
|
||||
if (model == 12 || model == 13) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 5:
|
||||
//Intel Broadwell
|
||||
if (model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Phi Knights Landing
|
||||
if (model == 7) {
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake or Denverton
|
||||
if (model == 12 || model == 15) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 6:
|
||||
if (model == 6) {
|
||||
// Cannon Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
if (model <= 0x2) return &gotoblas_NORTHWOOD;
|
||||
return &gotoblas_PRESCOTT;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
|
||||
if (family <= 0xe) {
|
||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
|
||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||
if ( (eax & 0xffff) >= 0x01) {
|
||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
return NULL;
|
||||
|
||||
return &gotoblas_ATHLON;
|
||||
}
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) {
|
||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
||||
else return &gotoblas_OPTERON;
|
||||
} else if (exfamily == 5) {
|
||||
return &gotoblas_BOBCAT;
|
||||
} else if (exfamily == 6) {
|
||||
if(model == 1){
|
||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
if(support_avx())
|
||||
return &gotoblas_BULLDOZER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 2 || model == 3){
|
||||
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 5){
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0 || model == 8){
|
||||
if (exmodel == 1) {
|
||||
//AMD Trinity
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 3) {
|
||||
//AMD STEAMROLLER
|
||||
if(support_avx())
|
||||
return &gotoblas_STEAMROLLER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 6) {
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1 || model == 8) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Human-readable core names. The index of each entry is the number that
   gotoblas_corename() and force_coretype() associate with the matching
   gotoblas_<CORE> table, so the order here must follow those functions:
   both treat 12 as OPTERON_SSE3 and 13 as OPTERON. (These two names used to
   be listed the other way round, which reported OPTERON_SSE3 as "Opteron"
   and made a forced "Opteron" select the SSE3 table, and vice versa.) */
static char *corename[] = {
    "Unknown",        /*  0 */
    "Katmai",         /*  1 */
    "Coppermine",     /*  2 */
    "Northwood",      /*  3 */
    "Prescott",       /*  4 */
    "Banias",         /*  5 */
    "Atom",           /*  6 */
    "Core2",          /*  7 */
    "Penryn",         /*  8 */
    "Dunnington",     /*  9 */
    "Nehalem",        /* 10 */
    "Athlon",         /* 11 */
    "Opteron_SSE3",   /* 12 */
    "Opteron",        /* 13 */
    "Barcelona",      /* 14 */
    "Nano",           /* 15 */
    "Sandybridge",    /* 16 */
    "Bobcat",         /* 17 */
    "Bulldozer",      /* 18 */
    "Piledriver",     /* 19 */
    "Haswell",        /* 20 */
    "Steamroller",    /* 21 */
    "Excavator",      /* 22 */
    "Zen",            /* 23 */
    "SkylakeX"        /* 24 */
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
||||
if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
||||
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
||||
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
|
||||
if (gotoblas == &gotoblas_OPTERON) return corename[13];
|
||||
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 24; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found < 0)
|
||||
{
|
||||
//strncpy(mname,coretype,20);
|
||||
snprintf(message, 128, "Core not found: %s\n",coretype);
|
||||
openblas_warning(1, message);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 24: return (&gotoblas_SKYLAKEX);
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
case 20: return (&gotoblas_HASWELL);
|
||||
case 19: return (&gotoblas_PILEDRIVER);
|
||||
case 18: return (&gotoblas_BULLDOZER);
|
||||
case 17: return (&gotoblas_BOBCAT);
|
||||
case 16: return (&gotoblas_SANDYBRIDGE);
|
||||
case 15: return (&gotoblas_NANO);
|
||||
case 14: return (&gotoblas_BARCELONA);
|
||||
case 13: return (&gotoblas_OPTERON);
|
||||
case 12: return (&gotoblas_OPTERON_SSE3);
|
||||
case 11: return (&gotoblas_ATHLON);
|
||||
case 10: return (&gotoblas_NEHALEM);
|
||||
case 9: return (&gotoblas_DUNNINGTON);
|
||||
case 8: return (&gotoblas_PENRYN);
|
||||
case 7: return (&gotoblas_CORE2);
|
||||
case 6: return (&gotoblas_ATOM);
|
||||
case 5: return (&gotoblas_BANIAS);
|
||||
case 4: return (&gotoblas_PRESCOTT);
|
||||
case 3: return (&gotoblas_NORTHWOOD);
|
||||
case 2: return (&gotoblas_COPPERMINE);
|
||||
case 1: return (&gotoblas_KATMAI);
|
||||
}
|
||||
return(NULL);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
#ifdef ARCH_X86
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
|
||||
#else
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
|
||||
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
|
||||
if (sizeof(void*) == 8) {
|
||||
if (gotoblas == &gotoblas_KATMAI ||
|
||||
gotoblas == &gotoblas_COPPERMINE ||
|
||||
gotoblas == &gotoblas_NORTHWOOD ||
|
||||
gotoblas == &gotoblas_BANIAS ||
|
||||
gotoblas == &gotoblas_ATHLON)
|
||||
gotoblas = &gotoblas_PRESCOTT;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
|
||||
gotoblas = NULL;
|
||||
|
||||
}
|
||||
@@ -50,7 +50,10 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
|
||||
gotoblas_init();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
gotoblas_quit();
|
||||
// If the process is about to exit, don't bother releasing any resources
|
||||
// The kernel is much better at bulk releasing then.
|
||||
if (!reserved)
|
||||
gotoblas_quit();
|
||||
break;
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
|
||||
@@ -618,19 +618,6 @@
|
||||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
@@ -647,33 +634,8 @@
|
||||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
@@ -690,45 +652,8 @@
|
||||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
@@ -745,45 +670,8 @@
|
||||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
@@ -800,18 +688,6 @@
|
||||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
@@ -819,24 +695,18 @@
|
||||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
ssysv_aa_2stage,
|
||||
ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage,
|
||||
chesv_aa_2stage,
|
||||
chetrf_aa_2stage,
|
||||
chetrs_aa_2stage,
|
||||
csysv_aa_2stage,
|
||||
csytrf_aa_2stage,
|
||||
csytrs_aa_2stage,
|
||||
dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage,
|
||||
dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage,
|
||||
zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage,
|
||||
zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage,
|
||||
zsytrs_aa_2stage
|
||||
|
||||
# functions added for lapack-3.9.0
|
||||
cgesvdq,
|
||||
cungtsqr,
|
||||
dcombssq,
|
||||
dgesvdq,
|
||||
dorgtsqr,
|
||||
scombssq,
|
||||
sgesvdq,
|
||||
sorgtsqr,
|
||||
zgesvdq,
|
||||
zungtsqr
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
@@ -3489,6 +3359,15 @@
|
||||
LAPACKE_zsytrf_aa_2stage_work,
|
||||
LAPACKE_zsytrs_aa_2stage,
|
||||
LAPACKE_zsytrs_aa_2stage_work,
|
||||
|
||||
# new functions from 3.9.0
|
||||
LAPACKE_dgesvdq,
|
||||
LAPACKE_dgesvdq_work,
|
||||
LAPACKE_sgesvdq,
|
||||
LAPACKE_sgesvdq_work,
|
||||
LAPACKE_zgesvdq,
|
||||
LAPACKE_zgesvdq_work
|
||||
|
||||
);
|
||||
|
||||
#These function may need 2 underscores.
|
||||
@@ -3509,6 +3388,65 @@
|
||||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
# 3.7.0
|
||||
slasyf_rk, ssyconvf_rook, ssytf2_rk,
|
||||
ssytrf_rk, ssytrs_3, ssytri_3,
|
||||
ssytri_3x, ssycon_3, ssysv_rk,
|
||||
slasyf_aa, ssysv_aa, ssytrf_aa,
|
||||
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
|
||||
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
|
||||
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
|
||||
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
|
||||
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
|
||||
dsytf2_rk, dsytrf_rk, dsytrs_3,
|
||||
dsytri_3, dsytri_3x, dsycon_3,
|
||||
dsysv_rk, dlasyf_aa, dsysv_aa,
|
||||
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
|
||||
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
|
||||
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
|
||||
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
|
||||
dsbevd_2stage, dsygv_2stage, chetf2_rk,
|
||||
chetrf_rk, chetri_3, chetri_3x,
|
||||
chetrs_3, checon_3, chesv_rk,
|
||||
chesv_aa, chetrf_aa, chetrs_aa,
|
||||
clahef_aa, clahef_rk, clasyf_rk,
|
||||
clasyf_aa, csytf2_rk, csytrf_rk,
|
||||
csytrf_aa, csytrs_3, csytrs_aa,
|
||||
csytri_3, csytri_3x, csycon_3,
|
||||
csysv_rk, csysv_aa, csyconvf_rook,
|
||||
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
|
||||
chb2st_kernels, cheevd_2stage, cheev_2stage,
|
||||
cheevx_2stage, cheevr_2stage, chbev_2stage,
|
||||
chbevx_2stage, chbevd_2stage, chegv_2stage,
|
||||
zhetf2_rk, zhetrf_rk, zhetri_3,
|
||||
zhetri_3x, zhetrs_3, zhecon_3,
|
||||
zhesv_rk, zhesv_aa, zhetrf_aa,
|
||||
zhetrs_aa, zlahef_aa, zlahef_rk,
|
||||
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
|
||||
zsytrs_aa, zsytf2_rk, zsytrf_rk,
|
||||
zsytrf_aa, zsytrs_3, zsytri_3,
|
||||
zsytri_3x, zsycon_3, zsysv_rk,
|
||||
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
|
||||
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
|
||||
zheev_2stage, zheevx_2stage, zheevr_2stage,
|
||||
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
# 3.8.0
|
||||
ssysv_aa_2stage, ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage, chesv_aa_2stage,
|
||||
chetrf_aa_2stage, chetrs_aa_2stage,
|
||||
csysv_aa_2stage, csytrf_aa_2stage,
|
||||
csytrs_aa_2stage, dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage, dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage, zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage, zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage, zsytrs_aa_2stage,
|
||||
# 3.9.0
|
||||
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
|
||||
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
|
||||
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
|
||||
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
|
||||
|
||||
);
|
||||
|
||||
|
||||
|
||||
12
f_check
12
f_check
@@ -19,7 +19,7 @@ $nofortran = 0;
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
@@ -71,7 +71,7 @@ if ($compiler eq "") {
|
||||
|
||||
if ($data =~ /GNU/) {
|
||||
|
||||
$data =~ /(\d)\.(\d).(\d)/;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
|
||||
@@ -130,6 +130,11 @@ if ($compiler eq "") {
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
@@ -277,6 +282,8 @@ $linker_a = "";
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
|
||||
@@ -327,6 +334,7 @@ if ($link ne "") {
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /^\-l$/)
|
||||
) {
|
||||
$linker_l .= $flags . " ";
|
||||
|
||||
47
getarch.c
47
getarch.c
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -1028,6 +1028,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_NEOVERSEN1
/* Fixed build configuration used when the target is forced to NEOVERSEN1. */
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "NEOVERSEN1"
#define SUBDIRNAME      "arm64"
/*
 * The pieces below are adjacent string literals and are concatenated as-is,
 * so every piece except the last must end in a space; otherwise two flags
 * fuse into one (e.g. "-DARMV8-march=armv8.2-a").
 */
#define ARCHCONFIG   "-DNEOVERSEN1 " \
       "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
       "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \
       "-march=armv8.2-a -mtune=cortex-a72"
#define LIBNAME   "neoversen1"
#define CORENAME  "NEOVERSEN1"
#else
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_FALKOR
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
@@ -1093,6 +1111,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_EMAG8180
/* Fixed build configuration used when the target is forced to EMAG8180. */
#define ARMV8
#define FORCE
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "EMAG8180"
#define SUBDIRNAME      "arm64"
/* Adjacent string literals: each piece but the last ends in a space. */
#define ARCHCONFIG   "-DEMAG8180 " \
       "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
       "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME   "emag8180"
#define CORENAME  "EMAG8180"
#endif
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
@@ -1201,7 +1235,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1215,7 +1249,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
@@ -1298,6 +1332,13 @@ int main(int argc, char *argv[]){
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
||||
#endif
|
||||
#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0
|
||||
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
||||
#endif
|
||||
|
||||
#ifdef MAKE_NB_JOBS
|
||||
#if MAKE_NB_JOBS > 0
|
||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
||||
|
||||
@@ -394,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
||||
SLAPACKOBJS = \
|
||||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
|
||||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#DLAPACKOBJS = \
|
||||
@@ -405,14 +405,14 @@ SLAPACKOBJS = \
|
||||
DLAPACKOBJS = \
|
||||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
|
||||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
QLAPACKOBJS = \
|
||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
||||
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
||||
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
|
||||
qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
qtrtrs.$(SUFFIX)
|
||||
|
||||
#CLAPACKOBJS = \
|
||||
# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
@@ -423,7 +423,7 @@ QLAPACKOBJS = \
|
||||
CLAPACKOBJS = \
|
||||
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#ZLAPACKOBJS = \
|
||||
@@ -435,13 +435,14 @@ CLAPACKOBJS = \
|
||||
ZLAPACKOBJS = \
|
||||
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
|
||||
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
XLAPACKOBJS = \
|
||||
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
||||
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xtrtrs.$(SUFFIX)
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
SBLASOBJS += $(SLAPACKOBJS)
|
||||
@@ -2031,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
@@ -2040,7 +2041,25 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c
|
||||
|
||||
@@ -44,19 +44,19 @@
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGESV "
|
||||
#define ERROR_NAME "QGESV"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGESV "
|
||||
#define ERROR_NAME "DGESV"
|
||||
#else
|
||||
#define ERROR_NAME "SGESV "
|
||||
#define ERROR_NAME "SGESV"
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGESV "
|
||||
#define ERROR_NAME "XGESV"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZGESV "
|
||||
#define ERROR_NAME "ZGESV"
|
||||
#else
|
||||
#define ERROR_NAME "CGESV "
|
||||
#define ERROR_NAME "CGESV"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -89,7 +89,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
|
||||
if (args.m < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
||||
if (trans < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (uplo < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
171
interface/lapack/trtrs.c
Normal file
171
interface/lapack/trtrs.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "STRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Single-threaded TRTRS kernels.  The interface routine indexes this table
 * with (uplo << 2) | (trans << 1) | diag, so the entry order must stay
 * exactly as listed.
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_SINGLE,	/* index 0 */
  TRTRS_UNN_SINGLE,	/* index 1 */
  TRTRS_UTU_SINGLE,	/* index 2 */
  TRTRS_UTN_SINGLE,	/* index 3 */
  TRTRS_LNU_SINGLE,	/* index 4 */
  TRTRS_LNN_SINGLE,	/* index 5 */
  TRTRS_LTU_SINGLE,	/* index 6 */
  TRTRS_LTN_SINGLE,	/* index 7 */
};
|
||||
|
||||
#ifdef SMP
/*
 * Multi-threaded TRTRS kernels (SMP builds only).  Indexed the same way as
 * the single-threaded table: (uplo << 2) | (trans << 1) | diag.
 */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_PARALLEL,	/* index 0 */
  TRTRS_UNN_PARALLEL,	/* index 1 */
  TRTRS_UTU_PARALLEL,	/* index 2 */
  TRTRS_UTN_PARALLEL,	/* index 3 */
  TRTRS_LNU_PARALLEL,	/* index 4 */
  TRTRS_LNN_PARALLEL,	/* index 5 */
  TRTRS_LTU_PARALLEL,	/* index 6 */
  TRTRS_LTN_PARALLEL,	/* index 7 */
};
#endif
|
||||
|
||||
/*
 * Fortran-callable xTRTRS interface (real precisions).
 *
 * Validates the arguments, reports the first invalid one through xerbla
 * and *Info (as a negative value), and otherwise dispatches to the
 * single-threaded or parallel TRTRS kernel selected by UPLO/TRANS/DIAG.
 *
 *   UPLO   'U' or 'L'            (argument 1)
 *   TRANS  'N', 'T', 'R' or 'C'  (argument 2; 'R' is treated like 'N' and
 *                                 'C' like 'T')
 *   DIAG   'U' or 'N'            (argument 3)
 *   N, NRHS, a, ldA, b, ldB      arguments 4..9, passed on via blas_arg_t
 *   Info   0 on success; -i if argument i is invalid; for DIAG = 'N', the
 *          value returned by IAMIN_K over the diagonal when the smallest
 *          diagonal magnitude is zero (presumably the 1-based position of
 *          that entry -- confirm against IAMIN_K).
 *
 * All three flag characters are accepted in either case.  Always returns 0.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	 FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m    = *N;
  args.n    = *NRHS;
  args.a    = (void *)a;
  args.lda  = *ldA;
  args.b    = (void *)b;
  args.ldb  = *ldB;

  info = 0;

  /* Normalise every flag character, not just TRANS, before comparing. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
  if (trans_arg == 'C') trans = 1;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /*
   * Checks run from the last argument to the first so that, when several
   * arguments are invalid, the lowest-numbered one is the one reported.
   */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0) info = 5;
  if (args.m < 0) info = 4;
  if (diag  < 0) info = 3;
  if (trans < 0) info = 2;
  if (uplo  < 0) info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta = NULL;

  *Info = 0;

  if (args.m == 0) return 0;

  /* Non-unit diagonal: scan the diagonal (stride lda + 1) for a zero. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  buffer = (FLOAT *)blas_memory_alloc(1);

  /* Carve the two work areas handed to the kernel out of the one buffer. */
  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
||||
if (trans < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
if (uplo < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
||||
171
interface/lapack/ztrtrs.c
Normal file
171
interface/lapack/ztrtrs.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "CTRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Single-threaded TRTRS kernels.
 *
 * NAME() below indexes this table with (uplo << 3) | (trans << 1) | diag,
 * where uplo is 0 for 'U' and 1 for 'L', trans is 0..3 for 'N','T','R','C',
 * and diag is 0 for 'U' (unit) and 1 for 'N' (non-unit).  The entry order
 * must therefore stay exactly as listed.
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  /* uplo = 'U' */
  TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE,
  TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE,
  TRTRS_URU_SINGLE, TRTRS_URN_SINGLE,
  TRTRS_UCU_SINGLE, TRTRS_UCN_SINGLE,
  /* uplo = 'L' */
  TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE,
  TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE,
  TRTRS_LRU_SINGLE, TRTRS_LRN_SINGLE,
  TRTRS_LCU_SINGLE, TRTRS_LCN_SINGLE,
};
|
||||
|
||||
#ifdef SMP
/*
 * Multi-threaded TRTRS kernels, used by NAME() when more than one thread is
 * available.  Indexed with (uplo << 3) | (trans << 1) | diag, in the same
 * entry order as the single-threaded table.
 */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  /* uplo = 'U' */
  TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL,
  TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL,
  TRTRS_URU_PARALLEL, TRTRS_URN_PARALLEL,
  TRTRS_UCU_PARALLEL, TRTRS_UCN_PARALLEL,
  /* uplo = 'L' */
  TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL,
  TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL,
  TRTRS_LRU_PARALLEL, TRTRS_LRN_PARALLEL,
  TRTRS_LCU_PARALLEL, TRTRS_LCN_PARALLEL,
};
#endif
|
||||
|
||||
/*
 * Fortran-callable xTRTRS interface (the routine name reported to xerbla is
 * ERROR_NAME, i.e. CTRTRS / ZTRTRS / XTRTRS depending on the build).
 *
 * Arguments (all passed by reference):
 *   UPLO   'U' or 'L'            (argument 1)
 *   TRANS  'N', 'T', 'R' or 'C'  (argument 2)
 *   DIAG   'U' or 'N'            (argument 3)
 *   N      stored in args.m      (argument 4, must be >= 0)
 *   NRHS   stored in args.n      (argument 5, must be >= 0)
 *   a,ldA  matrix and leading dimension (argument 7: ldA >= max(1, N))
 *   b,ldB  matrix and leading dimension (argument 9: ldB >= max(1, N))
 *   Info   output status:
 *            0   on success or when N == 0,
 *           -i   when argument i is invalid (xerbla is also called),
 *           >0   the index returned by IAMIN_K over the diagonal of a when
 *                DIAG is 'N' and AMIN_K over that diagonal is exactly ZERO.
 *
 * The option characters are accepted in either case.  The function itself
 * always returns 0; the status is delivered through *Info only.
 * NOTE(review): the selected TRTRS_* kernel presumably solves the triangular
 * system in place in b -- confirm against the kernel sources.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	 FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m    = *N;
  args.n    = *NRHS;
  args.a    = (void *)a;
  args.lda  = *ldA;
  args.b    = (void *)b;
  args.ldb  = *ldB;

  info = 0;

  /* Option characters are case-insensitive: normalise all three, not only TRANS. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 2;
  if (trans_arg == 'C') trans = 3;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /* Checks run from the highest argument index to the lowest so that the
     last assignment -- the lowest-numbered invalid argument -- is the one
     reported.  DIAG (3) therefore has to be tested before TRANS (2) and
     UPLO (1), not after them. */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0)                info = 5;
  if (args.m < 0)                info = 4;
  if (diag  < 0)                 info = 3;
  if (trans < 0)                 info = 2;
  if (uplo  < 0)                 info = 1;

  if (info != 0) {
    /* xerbla receives the positive argument index; the caller gets -index. */
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Quick return for an empty system. */
  if (args.m == 0) return 0;

  /* Non-unit diagonal: a stride of lda + 1 walks the diagonal of a.  If its
     minimum absolute value is exactly zero, report the index found by
     IAMIN_K and do not call the solver. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  /* Carve the two work areas handed to the kernel (sa, sb) out of one buffer. */
  buffer = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    /* Table index: uplo in bit 3, trans in bits 1-2, diag in bit 0. */
    (trtrs_single[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
||||
@@ -47,7 +47,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type})
|
||||
endif ()
|
||||
if (DEFINED ${float_char}MINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "USE_MIN" "min_k" false "" "" false ${float_type})
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type})
|
||||
@@ -55,7 +55,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type})
|
||||
endif ()
|
||||
if (DEFINED I${float_char}MINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "USE_MIN" "i*min_k" false "" "" false ${float_type})
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type})
|
||||
@@ -121,8 +121,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
|
||||
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) )
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -5,6 +5,11 @@ endif
|
||||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
endif
|
||||
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
USE_GEMM3M = 0
|
||||
OS := $(shell uname)
|
||||
|
||||
ifeq ($(ARCH), x86)
|
||||
USE_GEMM3M = 1
|
||||
@@ -24,9 +25,11 @@ ifeq ($(TARGET), LOONGSON3B)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), GENERIC)
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
@@ -57,8 +60,6 @@ USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
SKERNELOBJS += \
|
||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
||||
@@ -436,7 +437,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
||||
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
|
||||
@@ -444,12 +453,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
||||
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
||||
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
@@ -460,7 +483,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
||||
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
@@ -496,7 +526,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
||||
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
@@ -512,7 +549,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
||||
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
@@ -537,37 +581,107 @@ endif
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
||||
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
||||
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
||||
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
||||
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
||||
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
@@ -584,28 +698,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
||||
|
||||
ifdef USE_TRMM
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
||||
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
||||
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
||||
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
||||
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
||||
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
||||
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
||||
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
@@ -620,52 +790,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
||||
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
||||
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
||||
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
||||
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
||||
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
||||
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
||||
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
||||
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
else
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
@@ -677,7 +960,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
@@ -804,7 +1094,14 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
||||
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
|
||||
@@ -1940,7 +2237,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||
|
||||
endif
|
||||
|
||||
# Builds $(DGEMMONCOPYOBJ_P) from $(DGEMMONCOPY) with the $(PFLAGS) flag set.
# The target has to be spelled exactly $(DGEMMONCOPYOBJ_P): any other
# spelling references an undefined variable, which expands to an empty
# target list and leaves this object without a usable rule.
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||
@@ -2044,7 +2341,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
|
||||
# $(PSUFFIX) build of cgemm_kernel_r: $(CGEMMKERNEL) compiled with
# -UDOUBLE -DCOMPLEX -DNC.
# On AIX the source is first preprocessed (-E), the output is run through
# m4, and the m4 result is compiled; the two temporary .s files are removed
# afterwards.  On every other OS the source is compiled directly.
# Both branches use $(PFLAGS) so that the $(PSUFFIX) object gets the same
# flag set regardless of the OS.
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(OS), AIX)
	$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
else
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
@@ -2083,7 +2387,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
# $(PSUFFIX) build of strmm_kernel_RT: $(SGEMMKERNEL) compiled with
# -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA.
# On AIX the source is first preprocessed (-E) into strmm_kernel_rt.s, that
# same file is run through m4, and the m4 result is compiled; the two
# temporary .s files are removed afterwards.  On every other OS the source
# is compiled directly.
# All compiler invocations use $(PFLAGS) so that the $(PSUFFIX) object gets
# the same flag set in every step and on every OS.
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
ifeq ($(OS), AIX)
	$(CC) $(PFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
else
	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
||||
@@ -91,12 +91,10 @@ IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
@@ -104,48 +102,35 @@ CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(OS_DARWIN)$(CROSS),11)
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
@@ -202,5 +187,3 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
|
||||
3
kernel/arm64/KERNEL.EMAG8180
Normal file
3
kernel/arm64/KERNEL.EMAG8180
Normal file
@@ -0,0 +1,3 @@
|
||||
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||
|
||||
|
||||
189
kernel/arm64/KERNEL.NEOVERSEN1
Normal file
189
kernel/arm64/KERNEL.NEOVERSEN1
Normal file
@@ -0,0 +1,189 @@
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
@@ -109,13 +109,29 @@ ZGEMVTKERNEL = zgemv_t.S
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
252
kernel/arm64/dgemm_beta.S
Normal file
252
kernel/arm64/dgemm_beta.S
Normal file
@@ -0,0 +1,252 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0
|
||||
#define N x1
|
||||
#define BETA d0
|
||||
#define LDC x6
|
||||
#define C00 x7
|
||||
|
||||
#define A01 x8
|
||||
#define A02 x9
|
||||
#define A03 x10
|
||||
#define A04 x11
|
||||
|
||||
#define beta0 d11
|
||||
#define betaV0 v11.d[0]
|
||||
#define I x16
|
||||
|
||||
#define prfm_size 640
|
||||
#define calc_size 128
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
.endm
|
||||
|
||||
.macro RESTORE_REGS
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
.endm
|
||||
|
||||
/*
 * INIT_ZERO: clear v0-v7 so the beta == 0 path can store zeros into C.
 *
 * The registers are cleared outright instead of being multiplied by beta:
 * nothing loads v1-v7 (or the upper half of v0) before this macro runs, so
 * they hold whatever the caller left there, and NaN * 0 or Inf * 0 would
 * produce NaN rather than zero.
 *
 * Clobbers v0-v7 (including BETA, which lives in v0); beta0 is untouched.
 */
.macro INIT_ZERO
	movi	v0.16b, #0
	movi	v1.16b, #0
	movi	v2.16b, #0
	movi	v3.16b, #0
	movi	v4.16b, #0
	movi	v5.16b, #0
	movi	v6.16b, #0
	movi	v7.16b, #0
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
ldr LDC, [sp]
|
||||
SAVE_REGS
|
||||
|
||||
.Lgemm_beta_BEGIN:
|
||||
|
||||
fmov beta0, BETA
|
||||
cmp N, #0
|
||||
ble .Lgemm_beta_L999
|
||||
|
||||
fcmp BETA, #0.0
|
||||
beq .Lgemm_beta_zero_01
|
||||
|
||||
.Lgemm_beta_01:
|
||||
|
||||
lsl LDC, LDC, #3
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_02:
|
||||
|
||||
mov A01, C00
|
||||
add C00, C00, LDC
|
||||
asr I, M, #4
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_04
|
||||
add A02, A01, #32
|
||||
add A03, A02, #32
|
||||
add A04, A03, #32
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_03:
|
||||
|
||||
ldp q0, q1, [A01]
|
||||
ldp q2, q3, [A02]
|
||||
ldp q4, q5, [A03]
|
||||
ldp q6, q7, [A04]
|
||||
|
||||
fmul v0.2d, v0.2d, betaV0
|
||||
fmul v1.2d, v1.2d, betaV0
|
||||
|
||||
fmul v2.2d, v2.2d, betaV0
|
||||
fmul v3.2d, v3.2d, betaV0
|
||||
|
||||
prfm PLDL1KEEP, [A01, prfm_size]
|
||||
|
||||
fmul v4.2d, v4.2d, betaV0
|
||||
fmul v5.2d, v5.2d, betaV0
|
||||
|
||||
prfm PLDL1KEEP, [A03, prfm_size]
|
||||
|
||||
fmul v6.2d, v6.2d, betaV0
|
||||
fmul v7.2d, v7.2d, betaV0
|
||||
|
||||
st1 {v0.2d, v1.2d}, [A01]
|
||||
add A01, A01, calc_size
|
||||
st1 {v2.2d, v3.2d}, [A02]
|
||||
add A02, A02, calc_size
|
||||
st1 {v4.2d, v5.2d}, [A03]
|
||||
add A03, A03, calc_size
|
||||
st1 {v6.2d, v7.2d}, [A04]
|
||||
add A04, A04, calc_size
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lgemm_beta_03
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_04:
|
||||
|
||||
and I, M , #15 // M%16
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_06
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_05:
|
||||
|
||||
ldr d12, [A01]
|
||||
fmul d12, d12, beta0
|
||||
str d12, [A01]
|
||||
add A01, A01, #8
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lgemm_beta_05
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_06:
|
||||
|
||||
subs N , N, #1 // N--
|
||||
bne .Lgemm_beta_02
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_L999:
|
||||
|
||||
mov x0, #0
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
.Lgemm_beta_zero_01:
|
||||
INIT_ZERO
|
||||
lsl LDC, LDC, #3
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_02:
|
||||
mov A01, C00
|
||||
add C00, C00, LDC
|
||||
|
||||
asr I, M, #4
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_zero_04
|
||||
|
||||
add A02, A01, #64
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_03:
|
||||
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [A01]
|
||||
add A01, A01, calc_size
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [A02]
|
||||
add A02, A02, calc_size
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lgemm_beta_zero_03
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_04:
|
||||
|
||||
and I, M, #15
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_zero_06
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_05:
|
||||
|
||||
str beta0, [A01]
|
||||
add A01, A01, #8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lgemm_beta_zero_05
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_06:
|
||||
|
||||
subs N, N, #1
|
||||
bne .Lgemm_beta_zero_02
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_L999:
|
||||
|
||||
mov x0, #0
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
@@ -54,37 +54,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
259
kernel/arm64/sgemm_beta.S
Executable file
259
kernel/arm64/sgemm_beta.S
Executable file
@@ -0,0 +1,259 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0
|
||||
#define N x1
|
||||
#define BETA s0
|
||||
#define LDC x6
|
||||
#define C00 x7
|
||||
|
||||
#define A01 x8
|
||||
#define A02 x9
|
||||
#define A03 x10
|
||||
#define A04 x11
|
||||
#define I x12
|
||||
|
||||
#define beta0 s11
|
||||
#define betaV0 v11.s[0]
|
||||
|
||||
#define prfm_size 640
|
||||
#define calc_size 128
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
.endm
|
||||
|
||||
.macro RESTORE_REGS
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
.endm
|
||||
|
||||
/*
 * INIT_ZERO: clear v0-v7 so the beta == 0 path can store zeros into C.
 *
 * The registers are cleared outright instead of being multiplied by beta:
 * nothing loads v1-v7 (or the upper lanes of v0) before this macro runs, so
 * they hold whatever the caller left there, and NaN * 0 or Inf * 0 would
 * produce NaN rather than zero.
 *
 * Clobbers v0-v7 (including BETA, which lives in v0); beta0 is untouched.
 */
.macro INIT_ZERO
	movi	v0.16b, #0
	movi	v1.16b, #0
	movi	v2.16b, #0
	movi	v3.16b, #0
	movi	v4.16b, #0
	movi	v5.16b, #0
	movi	v6.16b, #0
	movi	v7.16b, #0
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
ldr LDC, [sp]
|
||||
SAVE_REGS
|
||||
|
||||
.Lgemm_beta_BEGIN:
|
||||
|
||||
fmov beta0, BETA
|
||||
cmp N, #0
|
||||
ble .Lgemm_beta_L999
|
||||
|
||||
fcmp BETA, #0.0
|
||||
beq .Lgemm_beta_zero_01
|
||||
|
||||
.Lgemm_beta_01:
|
||||
|
||||
lsl LDC, LDC, #2
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_02:
|
||||
|
||||
mov A01, C00
|
||||
add C00, C00, LDC
|
||||
asr I, M, #5
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_04
|
||||
add A02, A01, #32
|
||||
add A03, A02, #32
|
||||
add A04, A03, #32
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_03:
|
||||
|
||||
prfm PLDL1KEEP, [A01, prfm_size]
|
||||
|
||||
ldp q0, q1, [A01]
|
||||
ldp q2, q3, [A02]
|
||||
ldp q4, q5, [A03]
|
||||
ldp q6, q7, [A04]
|
||||
|
||||
fmul v0.4s, v0.4s, betaV0
|
||||
fmul v1.4s, v1.4s, betaV0
|
||||
|
||||
fmul v2.4s, v2.4s, betaV0
|
||||
fmul v3.4s, v3.4s, betaV0
|
||||
|
||||
fmul v4.4s, v4.4s, betaV0
|
||||
fmul v5.4s, v5.4s, betaV0
|
||||
|
||||
fmul v6.4s, v6.4s, betaV0
|
||||
fmul v7.4s, v7.4s, betaV0
|
||||
|
||||
prfm PLDL1KEEP, [A01, prfm_size + 64]
|
||||
|
||||
st1 {v0.4s, v1.4s}, [A01]
|
||||
add A01, A01, calc_size
|
||||
st1 {v2.4s, v3.4s}, [A02]
|
||||
add A02, A02, calc_size
|
||||
st1 {v4.4s, v5.4s}, [A03]
|
||||
add A03, A03, calc_size
|
||||
st1 {v6.4s, v7.4s}, [A04]
|
||||
add A04, A04, calc_size
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lgemm_beta_03
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_04:
|
||||
|
||||
and I, M , #31
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_06
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_05:
|
||||
|
||||
ldr s12, [A01]
|
||||
fmul s12, s12, beta0
|
||||
str s12, [A01]
|
||||
add A01, A01, #4
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lgemm_beta_05
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_06:
|
||||
|
||||
subs N , N, #1 // N--
|
||||
bne .Lgemm_beta_02
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_L999:
|
||||
|
||||
mov x0, #0
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_01:
|
||||
|
||||
INIT_ZERO
|
||||
lsl LDC, LDC, #2
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_02:
|
||||
|
||||
mov A01, C00
|
||||
add C00, C00, LDC
|
||||
|
||||
asr I, M, #5
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_zero_04
|
||||
add A02, A01, #32
|
||||
add A03, A02, #32
|
||||
add A04, A03, #32
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_03:
|
||||
|
||||
st1 {v0.4s, v1.4s}, [A01]
|
||||
add A01, A01, calc_size
|
||||
st1 {v2.4s, v3.4s}, [A02]
|
||||
add A02, A02, calc_size
|
||||
st1 {v4.4s, v5.4s}, [A03]
|
||||
add A03, A03, calc_size
|
||||
st1 {v6.4s, v7.4s}, [A04]
|
||||
add A04, A04, calc_size
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lgemm_beta_zero_03
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_04:
|
||||
|
||||
and I, M, #31
|
||||
cmp I, #0
|
||||
ble .Lgemm_beta_zero_06
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_05:
|
||||
|
||||
str beta0, [A01]
|
||||
add A01, A01, #4
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lgemm_beta_zero_05
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_06:
|
||||
|
||||
subs N, N, #1
|
||||
bne .Lgemm_beta_zero_02
|
||||
|
||||
.align 5
|
||||
.Lgemm_beta_zero_L999:
|
||||
mov x0, #0
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
333
kernel/arm64/sgemm_ncopy_4.S
Normal file
333
kernel/arm64/sgemm_ncopy_4.S
Normal file
@@ -0,0 +1,333 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Incoming values, read straight from the argument registers. */
#define	M		x0	// elements copied from each source vector
#define	N		x1	// number of source vectors, LDA apart
#define	A00		x2	// source: first vector of the group being packed
#define	LDA		x3	// distance between vectors (scaled to bytes in the prologue)
#define	B00		x4	// destination write cursor

/* Read cursors for up to four source vectors handled together. */
#define	A01		x5
#define	A02		x6
#define	A03		x7
#define	A04		x8

/* Loop counters: I counts along a vector, J counts groups of four vectors. */
#define	I		x9
#define	J		x10

/* Scratch names; not referenced by the code visible in this file. */
#define	TEMP1		x11
#define	TEMP2		x12

/* Prefetch distance in bytes ahead of each read cursor. */
#define	A_PREFETCH	2560
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
/*
 * Open a 176-byte (11 * 16) stack frame and spill d8-d17 and x18-x28 into it.
 * The slot layout must stay in step with RESTORE_REGS.
 */
.macro SAVE_REGS
	sub	sp, sp, #176
	stp	d8,  d9,  [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	stp	d16, d17, [sp, #64]
	stp	x18, x19, [sp, #80]
	stp	x20, x21, [sp, #96]
	stp	x22, x23, [sp, #112]
	stp	x24, x25, [sp, #128]
	stp	x26, x27, [sp, #144]
	str	x28,      [sp, #160]
.endm
|
||||
|
||||
/*
 * Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS, then
 * release the 176-byte frame.
 */
.macro RESTORE_REGS
	ldr	x28,      [sp, #160]
	ldp	x26, x27, [sp, #144]
	ldp	x24, x25, [sp, #128]
	ldp	x22, x23, [sp, #112]
	ldp	x20, x21, [sp, #96]
	ldp	x18, x19, [sp, #80]
	ldp	d16, d17, [sp, #64]
	ldp	d14, d15, [sp, #48]
	ldp	d12, d13, [sp, #32]
	ldp	d10, d11, [sp, #16]
	ldp	d8,  d9,  [sp]
	add	sp, sp, #176
.endm
|
||||
|
||||
/*
 * Read four consecutive floats from each of A01..A04 and write them to B00
 * transposed: output vector k holds element k of A01, A02, A03, A04 in that
 * order. Every read cursor advances 16 bytes; B00 advances 64 bytes.
 */
.macro COPY4x4
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16

	// v8 <- element 0 of each source vector
	ins	v8.s[0], v0.s[0]
	ins	v8.s[1], v1.s[0]
	ins	v8.s[2], v2.s[0]
	ins	v8.s[3], v3.s[0]

	// v9 <- element 1
	ins	v9.s[0], v0.s[1]
	ins	v9.s[1], v1.s[1]
	ins	v9.s[2], v2.s[1]
	ins	v9.s[3], v3.s[1]

	// v10 <- element 2
	ins	v10.s[0], v0.s[2]
	ins	v10.s[1], v1.s[2]
	ins	v10.s[2], v2.s[2]
	ins	v10.s[3], v3.s[2]

	// v11 <- element 3
	ins	v11.s[0], v0.s[3]
	ins	v11.s[1], v1.s[3]
	ins	v11.s[2], v2.s[3]
	ins	v11.s[3], v3.s[3]

	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
.endm
|
||||
|
||||
/*
 * Copy one float from each of A01..A04 to four consecutive floats at B00.
 * Every read cursor advances 4 bytes; B00 advances 16 bytes.
 */
.macro COPY1x4
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4

	stp	s0, s1, [B00]
	stp	s2, s3, [B00, #8]
	add	B00, B00, #16
.endm
|
||||
|
||||
/*
 * Read four consecutive floats from each of A01 and A02 and write them to
 * B00 as four (A01[k], A02[k]) pairs. Only the low two lanes of v8..v11
 * are filled and stored. Read cursors advance 16 bytes; B00 advances 32.
 */
.macro COPY4x2
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ldr	q0, [A01], #16
	ldr	q1, [A02], #16

	ins	v8.s[0], v0.s[0]
	ins	v8.s[1], v1.s[0]

	ins	v9.s[0], v0.s[1]
	ins	v9.s[1], v1.s[1]

	ins	v10.s[0], v0.s[2]
	ins	v10.s[1], v1.s[2]

	ins	v11.s[0], v0.s[3]
	ins	v11.s[1], v1.s[3]

	st1	{v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32
.endm
|
||||
|
||||
|
||||
/*
 * Copy one float from A01 and one from A02 to two consecutive floats at B00.
 * Read cursors advance 4 bytes; B00 advances 8 bytes.
 */
.macro COPY1x2
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ldr	s0, [A01], #4
	ldr	s1, [A02], #4

	stp	s0, s1, [B00], #8
.endm
|
||||
|
||||
/* Copy four consecutive floats from A01 to B00; both cursors advance 16 bytes. */
.macro COPY4x1
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldr	q0, [A01]
	add	A01, A01, #16
	str	q0, [B00]
	add	B00, B00, #16
.endm
|
||||
|
||||
|
||||
/* Copy a single float from A01 to B00; both cursors advance 4 bytes. */
.macro COPY1x1
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldr	s0, [A01]
	add	A01, A01, #4
	str	s0, [B00]
	add	B00, B00, #4
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
/*
 * Pack N source vectors of M floats each (vectors are LDA elements apart,
 * starting at A00) into the contiguous buffer at B00.
 *
 * Vectors are consumed four at a time (COPY4x4 / COPY1x4), then a leftover
 * pair (COPY4x2 / COPY1x2), then a leftover single vector (COPY4x1 /
 * COPY1x1). Within each group the elements are walked four at a time with
 * a scalar tail for M & 3. Returns 0 in x0.
 */
	PROLOGUE

	.align 5

	SAVE_REGS

	lsl	LDA, LDA, #2			// element stride -> byte stride

/* ---- groups of four vectors ------------------------------------------- */
.Lsgemm_ncopy_L4_BEGIN:

	asr	J, N, #2			// J = N / 4
	cmp	J, #0
	ble	.Lsgemm_ncopy_L2_BEGIN

	.align 5
.Lsgemm_ncopy_L4_M4_BEGIN:

	// Point one cursor at each of the four vectors, then step A00 past them.
	mov	A01, A00
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A00, A04, LDA

	asr	I, M, #2			// I = M / 4
	cmp	I, #0
	ble	.Lsgemm_ncopy_L4_M4_40

	.align 5
.Lsgemm_ncopy_L4_M4_20:

	COPY4x4

	subs	I, I, #1
	bne	.Lsgemm_ncopy_L4_M4_20

.Lsgemm_ncopy_L4_M4_40:

	ands	I, M, #3			// I = M % 4 (scalar tail)
	beq	.Lsgemm_ncopy_L4_M4_END

	.align 5
.Lsgemm_ncopy_L4_M4_60:

	COPY1x4

	subs	I, I, #1
	bne	.Lsgemm_ncopy_L4_M4_60

.Lsgemm_ncopy_L4_M4_END:

	subs	J, J, #1			// next group of four
	bne	.Lsgemm_ncopy_L4_M4_BEGIN

/* ---- leftover pair of vectors (N & 2) --------------------------------- */
.Lsgemm_ncopy_L2_BEGIN:

	tst	N, #3				// nothing left over?
	beq	.Lsgemm_ncopy_L999

	tst	N, #2
	beq	.Lsgemm_ncopy_L1_BEGIN

.Lsgemm_ncopy_L2_M4_BEGIN:
	mov	A01, A00
	add	A02, A01, LDA
	add	A00, A02, LDA

	asr	I, M, #2			// I = M / 4
	cmp	I, #0
	ble	.Lsgemm_ncopy_L2_M4_40

	.align 5
.Lsgemm_ncopy_L2_M4_20:

	COPY4x2

	subs	I, I, #1
	bne	.Lsgemm_ncopy_L2_M4_20

.Lsgemm_ncopy_L2_M4_40:

	ands	I, M, #3			// I = M % 4
	beq	.Lsgemm_ncopy_L2_M4_END

	.align 5
.Lsgemm_ncopy_L2_M4_60:

	COPY1x2

	subs	I, I, #1
	bne	.Lsgemm_ncopy_L2_M4_60

.Lsgemm_ncopy_L2_M4_END:

/* ---- leftover single vector (N & 1) ----------------------------------- */
.Lsgemm_ncopy_L1_BEGIN:

	tst	N, #1
	beq	.Lsgemm_ncopy_L999

.Lsgemm_ncopy_L1_M4_BEGIN:

	mov	A01, A00

	asr	I, M, #2			// I = M / 4
	cmp	I, #0
	ble	.Lsgemm_ncopy_L1_M4_40

	.align 5
.Lsgemm_ncopy_L1_M4_20:

	COPY4x1

	subs	I, I, #1
	bne	.Lsgemm_ncopy_L1_M4_20

.Lsgemm_ncopy_L1_M4_40:

	ands	I, M, #3			// I = M % 4
	beq	.Lsgemm_ncopy_L1_M4_END

	.align 5
.Lsgemm_ncopy_L1_M4_60:

	COPY1x1

	subs	I, I, #1
	bne	.Lsgemm_ncopy_L1_M4_60

.Lsgemm_ncopy_L1_M4_END:

.Lsgemm_ncopy_L999:

	mov	x0, #0				// return value
	RESTORE_REGS
	ret

	EPILOGUE
|
||||
|
||||
824
kernel/arm64/sgemm_tcopy_16.S
Normal file
824
kernel/arm64/sgemm_tcopy_16.S
Normal file
@@ -0,0 +1,824 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Incoming values, read straight from the argument registers. */
#define	M		x0	// number of source rows (rows are LDA apart)
#define	N		x1	// floats copied from each row
#define	A		x2	// source: first row of the group being packed
#define	LDA		x3	// row-to-row distance (scaled to bytes in the prologue)
#define	B		x4	// destination base for the 16-wide chunks

#define	M8		x5	// M * 16 * SIZE: B00 step after each 16-wide chunk

/* Read cursors, one per source row of the current group (up to eight). */
#define	A01		x6
#define	A02		x7
#define	A03		x8
#define	A04		x9
#define	A05		x10
#define	A06		x11
#define	A07		x12
#define	A08		x13

/* Write cursors: B00 for 16-wide chunks, B01..B04 for the 8/4/2/1-wide tails. */
#define	B01		x14
#define	B02		x15
#define	B03		x16
#define	B04		x17
#define	B00		x22

/*
 * Loop counters. I lives in x21, not x18: x18 is the AAPCS64 platform
 * register and is reserved by some operating systems, so kernel code must
 * not use it as a general-purpose counter. x21 is callee-saved and is
 * already spilled and reloaded by SAVE_REGS / RESTORE_REGS.
 */
#define	I		x21
#define	J		x19

#define	TEMP1		x20

/* Prefetch distance in bytes ahead of each read cursor. */
#define	A_PREFETCH	256
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
/*
 * Reserve 11 * 16 = 176 bytes of stack and store d8-d17 and x18-x28 there.
 * RESTORE_REGS reads the same slots back.
 */
.macro SAVE_REGS
	sub	sp, sp, #176
	stp	d8,  d9,  [sp, #0]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	stp	d16, d17, [sp, #64]
	stp	x18, x19, [sp, #80]
	stp	x20, x21, [sp, #96]
	stp	x22, x23, [sp, #112]
	stp	x24, x25, [sp, #128]
	stp	x26, x27, [sp, #144]
	str	x28,      [sp, #160]
.endm
|
||||
|
||||
/*
 * Load d8-d17 and x18-x28 back from the slots filled by SAVE_REGS and pop
 * the 176-byte frame.
 */
.macro RESTORE_REGS
	ldr	x28,      [sp, #160]
	ldp	x26, x27, [sp, #144]
	ldp	x24, x25, [sp, #128]
	ldp	x22, x23, [sp, #112]
	ldp	x20, x21, [sp, #96]
	ldp	x18, x19, [sp, #80]
	ldp	d16, d17, [sp, #64]
	ldp	d14, d15, [sp, #48]
	ldp	d12, d13, [sp, #32]
	ldp	d10, d11, [sp, #16]
	ldp	d8,  d9,  [sp, #0]
	add	sp, sp, #176
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
/*
 * Copy 16 floats from each of the eight rows A01..A08 into one contiguous
 * 512-byte block starting at B00 (row 1 first, row 8 last). TEMP1 walks the
 * block and ends 512 bytes past B00's old value. Each row cursor advances
 * 64 bytes; B00 then advances by M8.
 */
.macro COPY16x8
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	TEMP1, B00, #64

	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	ld1	{v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

	ld1	{v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1], #64

	ld1	{v16.4s, v17.4s, v18.4s, v19.4s}, [A05], #64
	st1	{v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1], #64

	ld1	{v20.4s, v21.4s, v22.4s, v23.4s}, [A06], #64
	st1	{v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1], #64

	ld1	{v24.4s, v25.4s, v26.4s, v27.4s}, [A07], #64
	st1	{v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1], #64

	ld1	{v28.4s, v29.4s, v30.4s, v31.4s}, [A08], #64
	st1	{v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1], #64

	add	B00, B00, M8
.endm
|
||||
|
||||
/*
 * Copy 8 floats from each of the eight rows A01..A08 to 256 contiguous
 * bytes at B01, in row order. Row cursors advance 32 bytes; B01 advances 256.
 */
.macro COPY8x8
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	// rows 1-2
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

	// rows 3-4
	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64

	// rows 5-6
	ldp	q8, q9, [A05], #32
	ldp	q10, q11, [A06], #32
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B01], #64

	// rows 7-8
	ldp	q12, q13, [A07], #32
	ldp	q14, q15, [A08], #32
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [B01], #64
.endm
|
||||
|
||||
/*
 * Copy 4 floats from each of the eight rows A01..A08 to 128 contiguous
 * bytes at B02, in row order. Row cursors advance 16 bytes; B02 advances
 * 128. No prefetch is issued for this tail.
 */
.macro COPY4x8
	// rows 1-4
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64

	// rows 5-8
	ldr	q4, [A05], #16
	ldr	q5, [A06], #16
	ldr	q6, [A07], #16
	ldr	q7, [A08], #16
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B02], #64
.endm
|
||||
|
||||
/*
 * Copy 2 floats (one 8-byte load) from each of the eight rows A01..A08 to
 * 64 contiguous bytes at B03, in row order. Row cursors advance 8 bytes;
 * B03 advances 64. No prefetch is issued for this tail.
 */
.macro COPY2x8
	// rows 1-4
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	stp	d0, d1, [B03], #16
	stp	d2, d3, [B03], #16

	// rows 5-8
	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8
	stp	d4, d5, [B03], #16
	stp	d6, d7, [B03], #16
.endm
|
||||
|
||||
/*
 * Copy 1 float from each of the eight rows A01..A08 to 32 contiguous bytes
 * at B04, in row order. Every row cursor advances 4 bytes; B04 advances 32.
 * No prefetch is issued for this tail.
 *
 * Rows 5-8 are read with 4-byte loads only. An earlier form reloaded them
 * with 8-byte `ldr d, [..], #8` after the scalar loads: that duplicated the
 * load, touched 4 bytes beyond the element being copied (potentially past
 * the end of the source buffer on the last element of a row), and advanced
 * A05..A08 by 8 while A01..A04 advanced by 4. The bytes stored to B04 are
 * the same either way, since only the low 32 bits were ever written out.
 */
.macro COPY1x8
	// rows 1-4
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4
	stp	s0, s1, [B04], #8
	stp	s2, s3, [B04], #8

	// rows 5-8
	ldr	s4, [A05], #4
	ldr	s5, [A06], #4
	ldr	s6, [A07], #4
	ldr	s7, [A08], #4
	stp	s4, s5, [B04], #8
	stp	s6, s7, [B04], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
/*
 * Copy 16 floats from each of the four rows A01..A04 into one contiguous
 * 256-byte block starting at B00, in row order. TEMP1 is left pointing at
 * the last 64-byte piece written. Row cursors advance 64 bytes; B00 then
 * advances by M8.
 */
.macro COPY16x4
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	TEMP1, B00, #64

	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	ld1	{v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

	ld1	{v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]

	add	B00, B00, M8
.endm
|
||||
|
||||
/*
 * Copy 8 floats from each of the four rows A01..A04 to 128 contiguous bytes
 * at B01, in row order. Row cursors advance 32 bytes; B01 advances 128.
 */
.macro COPY8x4
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	// rows 1-2
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

	// rows 3-4
	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64
.endm
|
||||
|
||||
/*
 * Copy 4 floats from each of the four rows A01..A04 to 64 contiguous bytes
 * at B02, in row order. Row cursors advance 16 bytes; B02 advances 64.
 * No prefetch is issued for this tail.
 */
.macro COPY4x4
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16

	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64
.endm
|
||||
|
||||
/*
 * Copy 2 floats from each of the four rows A01..A04 to 32 contiguous bytes
 * at B03, in row order. Row cursors advance 8 bytes; B03 advances 32.
 * No prefetch is issued for this tail.
 */
.macro COPY2x4
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8

	stp	d0, d1, [B03], #16
	stp	d2, d3, [B03], #16
.endm
|
||||
|
||||
/*
 * Copy 1 float from each of the four rows A01..A04 to 16 contiguous bytes
 * at B04, in row order. Row cursors advance 4 bytes; B04 advances 16.
 * No prefetch is issued for this tail.
 */
.macro COPY1x4
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4

	stp	s0, s1, [B04], #8
	stp	s2, s3, [B04], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
/*
 * Copy 16 floats from each of the two rows A01, A02 into one contiguous
 * 128-byte block starting at B00 (A01's data first). TEMP1 is left at
 * B00 + 64 (old B00). Row cursors advance 64 bytes; B00 then advances by M8.
 */
.macro COPY16x2
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64

	add	TEMP1, B00, #64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]

	add	B00, B00, M8
.endm
|
||||
|
||||
/*
 * Copy 8 floats from each of the two rows A01, A02 to 64 contiguous bytes
 * at B01 (A01's data first). Row cursors advance 32 bytes; B01 advances 64.
 */
.macro COPY8x2
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ld1	{v0.4s, v1.4s}, [A01], #32
	ld1	{v2.4s, v3.4s}, [A02], #32

	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64
.endm
|
||||
|
||||
/*
 * Copy 4 floats from each of the two rows A01, A02 to 32 contiguous bytes
 * at B02 (A01's data first). Row cursors advance 16 bytes; B02 advances 32.
 * No prefetch is issued for this tail.
 */
.macro COPY4x2
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16

	stp	q0, q1, [B02], #32
.endm
|
||||
|
||||
/*
 * Copy 2 floats from each of the two rows A01, A02 to 16 contiguous bytes
 * at B03 (A01's data first). Row cursors advance 8 bytes; B03 advances 16.
 * No prefetch is issued for this tail.
 */
.macro COPY2x2
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8

	stp	d0, d1, [B03], #16
.endm
|
||||
|
||||
/*
 * Copy 1 float from each of the two rows A01, A02 to 8 contiguous bytes at
 * B04 (A01's value first). Row cursors advance 4 bytes; B04 advances 8.
 * No prefetch is issued for this tail.
 */
.macro COPY1x2
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4

	stp	s0, s1, [B04], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
/*
 * Copy 16 floats from row A01 to B00. A01 advances 64 bytes; B00 advances
 * by M8.
 */
.macro COPY16x1
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]

	add	B00, B00, M8
.endm
|
||||
|
||||
/* Copy 8 floats from row A01 to B01; both cursors advance 32 bytes. */
.macro COPY8x1
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	stp	q0, q1, [B01], #32
.endm
|
||||
|
||||
/*
 * Copy 4 floats from row A01 to B02; both cursors advance 16 bytes.
 * No prefetch is issued for this tail.
 */
.macro COPY4x1
	ldr	q0, [A01], #16
	str	q0, [B02], #16
.endm
|
||||
|
||||
/*
 * Copy 2 floats from row A01 to B03; both cursors advance 8 bytes.
 * No prefetch is issued for this tail.
 */
.macro COPY2x1
	ldr	d0, [A01], #8
	str	d0, [B03], #8
.endm
|
||||
|
||||
/*
 * Copy 1 float from row A01 to B04; both cursors advance 4 bytes.
 * No prefetch is issued for this tail.
 */
.macro COPY1x1
	ldr	s0, [A01], #4
	str	s0, [B04], #4
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
/*
 * Pack M source rows of N floats each (rows are LDA elements apart, starting
 * at A) into the buffer at B.
 *
 * Rows are consumed eight at a time, then a leftover group of four, two and
 * one (bits 2, 1, 0 of M). Within a row group, full 16-float chunks are
 * written through B00, which starts at B and steps by M8 = M*16*SIZE per
 * chunk; B itself advances by rows*16*SIZE per row group. Leftover pieces of
 * 8/4/2/1 floats per row are written through B01/B02/B03/B04, which start at
 *     B + (N & -16) * M * SIZE,  B + (N & -8) * M * SIZE,
 *     B + (N & -4)  * M * SIZE,  B + (N & -2) * M * SIZE
 * respectively and only ever move forward. Returns 0 in x0.
 */
	PROLOGUE

	.align 5

	SAVE_REGS

	lsl	LDA, LDA, #2			// LDA = LDA * SIZE
	lsl	TEMP1, M, #2			// TEMP1 = M * SIZE
	lsl	M8, M, #6			// M8 = M * 16 * SIZE

	// Tail write cursors: Bxx = B + (N rounded down) * M * SIZE
	and	B01, N, #-16
	madd	B01, B01, TEMP1, B
	and	B02, N, #-8
	madd	B02, B02, TEMP1, B
	and	B03, N, #-4
	madd	B03, B03, TEMP1, B
	and	B04, N, #-2
	madd	B04, B04, TEMP1, B

/* ---- groups of eight rows --------------------------------------------- */
.Lsgemm_tcopy_L8_BEGIN:
	asr	J, M, #3			// J = M / 8
	cmp	J, #0
	ble	.Lsgemm_tcopy_L4_BEGIN

	.align 5
.Lsgemm_tcopy_L8_M16_BEGIN:

	// One read cursor per row, then step A past the eight rows.
	mov	A01, A
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A05, A04, LDA
	add	A06, A05, LDA
	add	A07, A06, LDA
	add	A08, A07, LDA
	add	A, A08, LDA

	mov	B00, B
	add	B, B00, #512			// B = B + 8 * 16 * SIZE

	asr	I, N, #4			// I = N / 16
	cmp	I, #0
	ble	.Lsgemm_tcopy_L8_M16_40

	.align 5
.Lsgemm_tcopy_L8_M16_20:

	COPY16x8

	subs	I, I, #1
	bne	.Lsgemm_tcopy_L8_M16_20

.Lsgemm_tcopy_L8_M16_40:
	tst	N, #8
	beq	.Lsgemm_tcopy_L8_M16_60

	COPY8x8

.Lsgemm_tcopy_L8_M16_60:
	tst	N, #4
	beq	.Lsgemm_tcopy_L8_M16_80

	COPY4x8

.Lsgemm_tcopy_L8_M16_80:
	tst	N, #2
	beq	.Lsgemm_tcopy_L8_M16_100

	COPY2x8

.Lsgemm_tcopy_L8_M16_100:
	tst	N, #1
	beq	.Lsgemm_tcopy_L8_M16_END

	COPY1x8

.Lsgemm_tcopy_L8_M16_END:

	subs	J, J, #1			// next group of eight rows
	bne	.Lsgemm_tcopy_L8_M16_BEGIN

/* ---- leftover group of four rows (M & 4) ------------------------------ */
.Lsgemm_tcopy_L4_BEGIN:
	tst	M, #7				// no leftover rows at all?
	beq	.Lsgemm_tcopy_L999

	tst	M, #4
	beq	.Lsgemm_tcopy_L2_BEGIN

.Lsgemm_tcopy_L4_M16_BEGIN:

	mov	A01, A
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A, A04, LDA

	mov	B00, B
	add	B, B00, #256			// B = B + 4 * 16 * SIZE

	asr	I, N, #4			// I = N / 16
	cmp	I, #0
	ble	.Lsgemm_tcopy_L4_M16_40

	.align 5
.Lsgemm_tcopy_L4_M16_20:

	COPY16x4

	subs	I, I, #1
	bne	.Lsgemm_tcopy_L4_M16_20

.Lsgemm_tcopy_L4_M16_40:
	tst	N, #8
	beq	.Lsgemm_tcopy_L4_M16_60

	COPY8x4

.Lsgemm_tcopy_L4_M16_60:
	tst	N, #4
	beq	.Lsgemm_tcopy_L4_M16_80

	COPY4x4

.Lsgemm_tcopy_L4_M16_80:
	tst	N, #2
	beq	.Lsgemm_tcopy_L4_M16_100

	COPY2x4

.Lsgemm_tcopy_L4_M16_100:
	tst	N, #1
	beq	.Lsgemm_tcopy_L4_M16_END

	COPY1x4

.Lsgemm_tcopy_L4_M16_END:

/* ---- leftover pair of rows (M & 2) ------------------------------------ */
.Lsgemm_tcopy_L2_BEGIN:

	tst	M, #3
	beq	.Lsgemm_tcopy_L999

	tst	M, #2
	beq	.Lsgemm_tcopy_L1_BEGIN

.Lsgemm_tcopy_L2_M16_BEGIN:
	mov	A01, A
	add	A02, A01, LDA
	add	A, A02, LDA

	mov	B00, B
	add	B, B00, #128			// B = B + 2 * 16 * SIZE

	asr	I, N, #4			// I = N / 16
	cmp	I, #0
	ble	.Lsgemm_tcopy_L2_M16_40

	.align 5
.Lsgemm_tcopy_L2_M16_20:

	COPY16x2

	subs	I, I, #1
	bne	.Lsgemm_tcopy_L2_M16_20

.Lsgemm_tcopy_L2_M16_40:
	tst	N, #8
	beq	.Lsgemm_tcopy_L2_M16_60

	COPY8x2

.Lsgemm_tcopy_L2_M16_60:
	tst	N, #4
	beq	.Lsgemm_tcopy_L2_M16_80

	COPY4x2

.Lsgemm_tcopy_L2_M16_80:
	tst	N, #2
	beq	.Lsgemm_tcopy_L2_M16_100

	COPY2x2

.Lsgemm_tcopy_L2_M16_100:
	tst	N, #1
	beq	.Lsgemm_tcopy_L2_M16_END

	COPY1x2

.Lsgemm_tcopy_L2_M16_END:

/* ---- last single row (M & 1) ------------------------------------------ */
.Lsgemm_tcopy_L1_BEGIN:

	tst	M, #1
	beq	.Lsgemm_tcopy_L999

.Lsgemm_tcopy_L1_M16_BEGIN:

	mov	A01, A
	mov	B00, B

	asr	I, N, #4			// I = N / 16
	cmp	I, #0
	ble	.Lsgemm_tcopy_L1_M16_40

	.align 5
.Lsgemm_tcopy_L1_M16_20:

	COPY16x1

	subs	I, I, #1
	bne	.Lsgemm_tcopy_L1_M16_20

.Lsgemm_tcopy_L1_M16_40:
	tst	N, #8
	beq	.Lsgemm_tcopy_L1_M16_60

	COPY8x1

.Lsgemm_tcopy_L1_M16_60:
	tst	N, #4
	beq	.Lsgemm_tcopy_L1_M16_80

	COPY4x1

.Lsgemm_tcopy_L1_M16_80:
	tst	N, #2
	beq	.Lsgemm_tcopy_L1_M16_100

	COPY2x1

.Lsgemm_tcopy_L1_M16_100:
	tst	N, #1
	beq	.Lsgemm_tcopy_L1_M16_END

	COPY1x1

.Lsgemm_tcopy_L1_M16_END:

.Lsgemm_tcopy_L999:
	mov	x0, #0				// return value
	RESTORE_REGS
	ret

	EPILOGUE
|
||||
|
||||
|
||||
@@ -54,138 +54,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr s5, [X], #4
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr d5, [X], #8
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_END_\@:
|
||||
4: /* KERNEL_F1_END_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X]
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq KERNEL_S1_NEXT
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_S1_SCALE_GE_XR_\@
|
||||
bge KERNEL_S1_SCALE_GE_XR
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_S1_NEXT_\@
|
||||
KERNEL_S1_SCALE_GE_XR_\@:
|
||||
b KERNEL_S1_NEXT
|
||||
KERNEL_S1_SCALE_GE_XR:
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_S1_NEXT_\@:
|
||||
KERNEL_S1_NEXT:
|
||||
ldr s5, [X, #4]
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_S1_END_\@
|
||||
beq KERNEL_S1_END
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_S1_SCALE_GE_XI_\@
|
||||
bge KERNEL_S1_SCALE_GE_XI
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_S1_END_\@
|
||||
KERNEL_S1_SCALE_GE_XI_\@:
|
||||
b KERNEL_S1_END
|
||||
KERNEL_S1_SCALE_GE_XI:
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X]
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq KERNEL_S1_NEXT
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_S1_SCALE_GE_XR_\@
|
||||
bge KERNEL_S1_SCALE_GE_XR
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_S1_NEXT_\@
|
||||
KERNEL_S1_SCALE_GE_XR_\@:
|
||||
b KERNEL_S1_NEXT
|
||||
KERNEL_S1_SCALE_GE_XR:
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_S1_NEXT_\@:
|
||||
KERNEL_S1_NEXT:
|
||||
ldr d5, [X, #8]
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_S1_END_\@
|
||||
beq KERNEL_S1_END
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_S1_SCALE_GE_XI_\@
|
||||
bge KERNEL_S1_SCALE_GE_XI
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_S1_END_\@
|
||||
KERNEL_S1_SCALE_GE_XI_\@:
|
||||
b KERNEL_S1_END
|
||||
KERNEL_S1_SCALE_GE_XI:
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_S1_END_\@:
|
||||
KERNEL_S1_END:
|
||||
add X, X, INC_X
|
||||
.endm
|
||||
|
||||
|
||||
@@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
||||
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
|
||||
FLOAT *c, BLASLONG ldc){
|
||||
|
||||
|
||||
BLASLONG i, j;
|
||||
BLASLONG chunk, remain;
|
||||
FLOAT *c_offset1, *c_offset;
|
||||
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
||||
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
||||
|
||||
c_offset = c;
|
||||
|
||||
chunk = m >> 3;
|
||||
remain = m & 7;
|
||||
if (beta == ZERO){
|
||||
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
|
||||
i = (m >> 3);
|
||||
if (i > 0){
|
||||
do {
|
||||
*(c_offset1 + 0) = ZERO;
|
||||
*(c_offset1 + 1) = ZERO;
|
||||
*(c_offset1 + 2) = ZERO;
|
||||
*(c_offset1 + 3) = ZERO;
|
||||
*(c_offset1 + 4) = ZERO;
|
||||
*(c_offset1 + 5) = ZERO;
|
||||
*(c_offset1 + 6) = ZERO;
|
||||
*(c_offset1 + 7) = ZERO;
|
||||
c_offset1 += 8;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
i = (m & 7);
|
||||
if (i > 0){
|
||||
do {
|
||||
*c_offset1 = ZERO;
|
||||
c_offset1 ++;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
j --;
|
||||
} while (j > 0);
|
||||
|
||||
for(j=n; j>0; j--){
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
for(i=chunk; i>0; i--){
|
||||
*(c_offset1 + 0) = ZERO;
|
||||
*(c_offset1 + 1) = ZERO;
|
||||
*(c_offset1 + 2) = ZERO;
|
||||
*(c_offset1 + 3) = ZERO;
|
||||
*(c_offset1 + 4) = ZERO;
|
||||
*(c_offset1 + 5) = ZERO;
|
||||
*(c_offset1 + 6) = ZERO;
|
||||
*(c_offset1 + 7) = ZERO;
|
||||
c_offset1 += 8;
|
||||
}
|
||||
for(i=remain; i>0; i--){
|
||||
*c_offset1 = ZERO;
|
||||
c_offset1 ++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
|
||||
i = (m >> 3);
|
||||
if (i > 0){
|
||||
do {
|
||||
ctemp1 = *(c_offset1 + 0);
|
||||
ctemp2 = *(c_offset1 + 1);
|
||||
ctemp3 = *(c_offset1 + 2);
|
||||
ctemp4 = *(c_offset1 + 3);
|
||||
ctemp5 = *(c_offset1 + 4);
|
||||
ctemp6 = *(c_offset1 + 5);
|
||||
ctemp7 = *(c_offset1 + 6);
|
||||
ctemp8 = *(c_offset1 + 7);
|
||||
|
||||
ctemp1 *= beta;
|
||||
ctemp2 *= beta;
|
||||
ctemp3 *= beta;
|
||||
ctemp4 *= beta;
|
||||
ctemp5 *= beta;
|
||||
ctemp6 *= beta;
|
||||
ctemp7 *= beta;
|
||||
ctemp8 *= beta;
|
||||
|
||||
*(c_offset1 + 0) = ctemp1;
|
||||
*(c_offset1 + 1) = ctemp2;
|
||||
*(c_offset1 + 2) = ctemp3;
|
||||
*(c_offset1 + 3) = ctemp4;
|
||||
*(c_offset1 + 4) = ctemp5;
|
||||
*(c_offset1 + 5) = ctemp6;
|
||||
*(c_offset1 + 6) = ctemp7;
|
||||
*(c_offset1 + 7) = ctemp8;
|
||||
c_offset1 += 8;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
i = (m & 7);
|
||||
if (i > 0){
|
||||
do {
|
||||
ctemp1 = *c_offset1;
|
||||
ctemp1 *= beta;
|
||||
*c_offset1 = ctemp1;
|
||||
c_offset1 ++;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
j --;
|
||||
} while (j > 0);
|
||||
|
||||
for(j=n; j>0; j--){
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
for(i=chunk; i>0; i--){
|
||||
*(c_offset1 + 0) *= beta;
|
||||
*(c_offset1 + 1) *= beta;
|
||||
*(c_offset1 + 2) *= beta;
|
||||
*(c_offset1 + 3) *= beta;
|
||||
*(c_offset1 + 4) *= beta;
|
||||
*(c_offset1 + 5) *= beta;
|
||||
*(c_offset1 + 6) *= beta;
|
||||
*(c_offset1 + 7) *= beta;
|
||||
c_offset1 += 8;
|
||||
}
|
||||
for(i=remain; i>0; i--){
|
||||
*c_offset1 *= beta;
|
||||
c_offset1 ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user