Compare commits
436 Commits
revert-146
...
revert-179
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
84bcdf9c66 | ||
|
|
8f7e986184 | ||
|
|
d0e83666ad | ||
|
|
d4bad73834 | ||
|
|
065763adde | ||
|
|
210b03b543 | ||
|
|
6234a32656 | ||
|
|
c0d7cd3dac | ||
|
|
667f0cc1cb | ||
|
|
d4c8853a02 | ||
|
|
d3d58f8ee5 | ||
|
|
697dc1baf8 | ||
|
|
a9b51b8448 | ||
|
|
eba394c711 | ||
|
|
582c589727 | ||
|
|
adbf6afa25 | ||
|
|
32bec8afbb | ||
|
|
6e2c494556 | ||
|
|
20c5d668fe | ||
|
|
6d43c51ccf | ||
|
|
d74dc39b0f | ||
|
|
41951da6d4 | ||
|
|
474f7e9583 | ||
|
|
79ea839b63 | ||
|
|
f7f97c6148 | ||
|
|
6f22e1cfb8 | ||
|
|
66b43affbc | ||
|
|
1938819c25 | ||
|
|
bda3dbe2eb | ||
|
|
c3e0f0eb38 | ||
|
|
a980953bd7 | ||
|
|
78c99d5231 | ||
|
|
b7496c3638 | ||
|
|
95f4e87579 | ||
|
|
b095f2fad6 | ||
|
|
02ef20a1e4 | ||
|
|
4c3643ed7f | ||
|
|
591cca7cb0 | ||
|
|
3439158dea | ||
|
|
45fe8cb0c5 | ||
|
|
544b069e85 | ||
|
|
9b2a7ad40d | ||
|
|
10ce70701a | ||
|
|
6fc85a6359 | ||
|
|
831c661386 | ||
|
|
7e5df34e6a | ||
|
|
4f45040b89 | ||
|
|
28aa94bf4b | ||
|
|
56e7c68810 | ||
|
|
cf6df9464c | ||
|
|
6f77af2eef | ||
|
|
4d183e5567 | ||
|
|
34d55fd165 | ||
|
|
b991570210 | ||
|
|
288aeea8a2 | ||
|
|
1ad1e79062 | ||
|
|
b402626509 | ||
|
|
ec0cac1669 | ||
|
|
2349e15149 | ||
|
|
f3c262156e | ||
|
|
30f5a69ab8 | ||
|
|
fd081a91e4 | ||
|
|
094f8c3b57 | ||
|
|
5cf090f516 | ||
|
|
58363542e7 | ||
|
|
3abc22a5bf | ||
|
|
1e531701b7 | ||
|
|
5d42b6ea04 | ||
|
|
ba4f433321 | ||
|
|
4cf7315a5d | ||
|
|
b57af93792 | ||
|
|
8aeab0601e | ||
|
|
1cb7b9015e | ||
|
|
a4bd41e9f2 | ||
|
|
9e2bb0c641 | ||
|
|
dbfd7524cd | ||
|
|
2982ce505d | ||
|
|
5bac15adbd | ||
|
|
e17f969fa0 | ||
|
|
e11126b26a | ||
|
|
74608e470d | ||
|
|
f3fd44a731 | ||
|
|
9e917b16db | ||
|
|
8440a4cb1a | ||
|
|
b55690a659 | ||
|
|
b902a40986 | ||
|
|
5991d1a6cd | ||
|
|
b1b743f434 | ||
|
|
2caa2210bb | ||
|
|
2a589c4b28 | ||
|
|
fd42ca462d | ||
|
|
52d3f7af50 | ||
|
|
5c6e020f49 | ||
|
|
d4d3113adc | ||
|
|
375dff54fc | ||
|
|
a5f165275a | ||
|
|
8c13aa495a | ||
|
|
1ee6d087c3 | ||
|
|
a95a784ab2 | ||
|
|
9bec34cb67 | ||
|
|
87bebdbd8a | ||
|
|
9493f26309 | ||
|
|
36add7570a | ||
|
|
cacacc8007 | ||
|
|
1a00ef3d27 | ||
|
|
4c0d832ec3 | ||
|
|
fc33cbc7bb | ||
|
|
c52a831ae4 | ||
|
|
2e99873ff7 | ||
|
|
00abaa865b | ||
|
|
33043f563f | ||
|
|
66da7677bd | ||
|
|
7932ff3ea9 | ||
|
|
62f4c69708 | ||
|
|
73478664d4 | ||
|
|
ee955757f9 | ||
|
|
48610a4524 | ||
|
|
4a553e8678 | ||
|
|
e788102c10 | ||
|
|
165f00c159 | ||
|
|
40c068a875 | ||
|
|
933896a1d0 | ||
|
|
a4e321400b | ||
|
|
9e65430504 | ||
|
|
2cfa86b406 | ||
|
|
2a9a9389ef | ||
|
|
6463bffd59 | ||
|
|
8ef7d4fb54 | ||
|
|
6400868e55 | ||
|
|
8ebf541e97 | ||
|
|
b03ae3f4dc | ||
|
|
2cc8fb0ad2 | ||
|
|
64826a0d7d | ||
|
|
25f2d25cfe | ||
|
|
73131fa30a | ||
|
|
66fcdd5be8 | ||
|
|
43ac839c16 | ||
|
|
7ba5936ecd | ||
|
|
b14f44d2ad | ||
|
|
e71d70ba87 | ||
|
|
d671870f5f | ||
|
|
4e103c822c | ||
|
|
d2142760e0 | ||
|
|
2fbfc64da8 | ||
|
|
8d5b33b6be | ||
|
|
36aea5ce2d | ||
|
|
1309711e24 | ||
|
|
571e9de2ac | ||
|
|
448ed15115 | ||
|
|
045fb5ea2c | ||
|
|
4dd70d98d7 | ||
|
|
504310eeb9 | ||
|
|
ea1f39518f | ||
|
|
5f2a3c05cd | ||
|
|
d0ec4325cf | ||
|
|
3f73e8b8cf | ||
|
|
a83f01e0ee | ||
|
|
a49203b48c | ||
|
|
b74aef2816 | ||
|
|
a9fa805007 | ||
|
|
9d15a3bd16 | ||
|
|
c6aec89d10 | ||
|
|
bbf2124970 | ||
|
|
1392eba488 | ||
|
|
e6d7711199 | ||
|
|
7a914347c5 | ||
|
|
61659f8765 | ||
|
|
3a8f0a6a1f | ||
|
|
3d3c19717c | ||
|
|
24e344038d | ||
|
|
4e9c34018e | ||
|
|
f5243e8e1f | ||
|
|
ba8388cee0 | ||
|
|
6e54b0a027 | ||
|
|
40c8cbc3bf | ||
|
|
d3c9eb4c7d | ||
|
|
f0a8dc2eec | ||
|
|
cc92257ea6 | ||
|
|
2aba1b1658 | ||
|
|
8396e9e777 | ||
|
|
bfad307ed7 | ||
|
|
b83e4c60c7 | ||
|
|
e344db269b | ||
|
|
545b82efd3 | ||
|
|
e322a951fe | ||
|
|
ff2f171036 | ||
|
|
092175cfec | ||
|
|
750162a05f | ||
|
|
e6d93f20f1 | ||
|
|
c38c65eb65 | ||
|
|
ce3651516f | ||
|
|
0144068537 | ||
|
|
1833a67071 | ||
|
|
0b2b83d9ed | ||
|
|
62cf769aa6 | ||
|
|
eb71d61c7c | ||
|
|
9cf22b7d91 | ||
|
|
cc66743b66 | ||
|
|
2aa0a5804e | ||
|
|
28c28ed275 | ||
|
|
a399d00425 | ||
|
|
f66b9c8826 | ||
|
|
2946c46024 | ||
|
|
05978528c3 | ||
|
|
ef6f0b645e | ||
|
|
0c5b7b400b | ||
|
|
952541e840 | ||
|
|
9369d3e6e5 | ||
|
|
10b70c904d | ||
|
|
6a5ab083b7 | ||
|
|
1f9e4f3193 | ||
|
|
5a6a2bed9a | ||
|
|
2d8cc7193a | ||
|
|
2ddc96c9e5 | ||
|
|
7e39ffe113 | ||
|
|
73de17664d | ||
|
|
6eb4b9ae7c | ||
|
|
5c6f008365 | ||
|
|
d148ec4ea1 | ||
|
|
9e162146a9 | ||
|
|
47bf0dba8f | ||
|
|
12603b7dbb | ||
|
|
bf40f806ef | ||
|
|
ed682a4a0c | ||
|
|
fcb77ab129 | ||
|
|
26e1cfb653 | ||
|
|
c628c6fa59 | ||
|
|
67d81ab49d | ||
|
|
2f957947a6 | ||
|
|
de8fff671d | ||
|
|
6f71c0fce4 | ||
|
|
c2545b0fd6 | ||
|
|
e65f451409 | ||
|
|
02634b549b | ||
|
|
0bea6bb9e7 | ||
|
|
3313e4b946 | ||
|
|
e9cd11768c | ||
|
|
63f7395fb4 | ||
|
|
1cbd8f3ae4 | ||
|
|
6c2d90ba77 | ||
|
|
0297b3211a | ||
|
|
66316b9f4c | ||
|
|
6adc4b7b36 | ||
|
|
2ade0ef085 | ||
|
|
e8880c1699 | ||
|
|
ed7c4a043b | ||
|
|
cf234a0561 | ||
|
|
ae2a33128b | ||
|
|
e4718b1fee | ||
|
|
9b87b64262 | ||
|
|
0218b884c1 | ||
|
|
83da278093 | ||
|
|
358d4df2bd | ||
|
|
06d43760e4 | ||
|
|
a4af8861ff | ||
|
|
7fb62aed7e | ||
|
|
f6021c798d | ||
|
|
e8002536ec | ||
|
|
ce6317f6c0 | ||
|
|
15a78d6b66 | ||
|
|
354a976a59 | ||
|
|
38ad05bd04 | ||
|
|
b7feded85a | ||
|
|
dc9fe05ab5 | ||
|
|
8be027e4c6 | ||
|
|
ac7b6e3e9a | ||
|
|
fc66a0ec0b | ||
|
|
89372e0993 | ||
|
|
ef626c6824 | ||
|
|
83fec56a3f | ||
|
|
5a51cf4576 | ||
|
|
5a92b311e0 | ||
|
|
a7d0f49cec | ||
|
|
f1fb9a4745 | ||
|
|
0023515733 | ||
|
|
99c7bba8e4 | ||
|
|
36c4523d85 | ||
|
|
a8002e283a | ||
|
|
401adddb2b | ||
|
|
c5b13d4e10 | ||
|
|
677e42d7b0 | ||
|
|
e2a8c35e5a | ||
|
|
1a49fb1c05 | ||
|
|
8562d5787a | ||
|
|
93f1eb09c3 | ||
|
|
c90bbda3df | ||
|
|
7df8c4f76f | ||
|
|
2fc748bf72 | ||
|
|
a91f1587b9 | ||
|
|
d1b7be14aa | ||
|
|
b491b10057 | ||
|
|
5fae96fb70 | ||
|
|
a7dbd4c57d | ||
|
|
2cae104b5e | ||
|
|
908d40be71 | ||
|
|
43e592ceb3 | ||
|
|
f0f27868d8 | ||
|
|
961d25e9c7 | ||
|
|
939452ea9d | ||
|
|
f5959f2543 | ||
|
|
82012b960b | ||
|
|
8dd3515fa2 | ||
|
|
95f7f0229c | ||
|
|
5082fe4306 | ||
|
|
7a7619af6d | ||
|
|
9a400b7014 | ||
|
|
893b535540 | ||
|
|
6791294312 | ||
|
|
ddb8b124de | ||
|
|
191746c493 | ||
|
|
eb9b021d38 | ||
|
|
7d7564568c | ||
|
|
a07843bc93 | ||
|
|
41ae8e8d67 | ||
|
|
9c1aa0b0fe | ||
|
|
53457f222f | ||
|
|
458e3af5b1 | ||
|
|
3716267124 | ||
|
|
50acc40613 | ||
|
|
c720f1f019 | ||
|
|
d7d950fcf2 | ||
|
|
12398e53ce | ||
|
|
193f835662 | ||
|
|
7e3151ead7 | ||
|
|
e3a069f108 | ||
|
|
6fff8c626a | ||
|
|
d2b9389f1b | ||
|
|
65b8a5c5d8 | ||
|
|
9795adc7ef | ||
|
|
1a8e487c4a | ||
|
|
5966fd52a2 | ||
|
|
dbafe6357b | ||
|
|
71051259e0 | ||
|
|
73cc321190 | ||
|
|
018f2dad27 | ||
|
|
9d5098dbc9 | ||
|
|
d94d7baf7e | ||
|
|
3af1b5c805 | ||
|
|
88e224f4c0 | ||
|
|
d0c0506588 | ||
|
|
e93355e5e1 | ||
|
|
c1eb06e102 | ||
|
|
8145ecd70b | ||
|
|
26ce518d46 | ||
|
|
1d27fa8507 | ||
|
|
802cf6b22d | ||
|
|
894433a7c7 | ||
|
|
1b83341d19 | ||
|
|
954f1832de | ||
|
|
941ad280a8 | ||
|
|
a8ed428bab | ||
|
|
1da365312a | ||
|
|
2d0929fa7c | ||
|
|
125343cc88 | ||
|
|
8a3b6fa108 | ||
|
|
78694f1b7e | ||
|
|
9c5518319a | ||
|
|
86f49c529d | ||
|
|
625c74a38f | ||
|
|
5fcaca6438 | ||
|
|
4fcdd24459 | ||
|
|
68a3c4fca6 | ||
|
|
0c4718c57a | ||
|
|
f29389c7ac | ||
|
|
734d7c6a93 | ||
|
|
7c861605b2 | ||
|
|
2ca0faf495 | ||
|
|
0fe434598b | ||
|
|
15c437e092 | ||
|
|
b966bd79d5 | ||
|
|
2e988dbf35 | ||
|
|
be6090d396 | ||
|
|
daae8fd197 | ||
|
|
20c6c38e51 | ||
|
|
a1fb7670f7 | ||
|
|
6c99c97489 | ||
|
|
6a0930560e | ||
|
|
24f8d5b624 | ||
|
|
77b4dbd53b | ||
|
|
bc4c3bca01 | ||
|
|
6b0a9d135c | ||
|
|
137ccd9dd9 | ||
|
|
84923dedb7 | ||
|
|
8ec28ff461 | ||
|
|
ca8ca796d3 | ||
|
|
8f811a9312 | ||
|
|
36a17536ca | ||
|
|
bb9876db33 | ||
|
|
d636b418af | ||
|
|
a460c92577 | ||
|
|
33f838393c | ||
|
|
a41d241a0e | ||
|
|
8da6b6ae52 | ||
|
|
01c4b82f04 | ||
|
|
93db123f7e | ||
|
|
752fdb5dd8 | ||
|
|
07ed01e97f | ||
|
|
35c5a32309 | ||
|
|
c7b55b6082 | ||
|
|
840e01061f | ||
|
|
28ca97015d | ||
|
|
73c5ca74fa | ||
|
|
e453555d97 | ||
|
|
6a6ffaff1e | ||
|
|
28ac9ea5a6 | ||
|
|
a55694dd5b | ||
|
|
85a41e9cdb | ||
|
|
40160ff3c1 | ||
|
|
6a99fcce94 | ||
|
|
2c7392f07b | ||
|
|
81215711a2 | ||
|
|
809fd0d451 | ||
|
|
72e65157df | ||
|
|
69a8aa6de2 | ||
|
|
0ab5bf1746 | ||
|
|
22167170b3 | ||
|
|
69d9f36ff4 | ||
|
|
f81815e48a | ||
|
|
5f855d965d | ||
|
|
fa9ca65c0e | ||
|
|
719b68f077 | ||
|
|
fe9f15f2d8 | ||
|
|
497f0c3d8a | ||
|
|
ea37db828e | ||
|
|
e6a0a3de73 | ||
|
|
6e70287776 | ||
|
|
58f236ad73 | ||
|
|
e207107150 | ||
|
|
c9d408064a | ||
|
|
288d1a3f6e | ||
|
|
7c1925acec | ||
|
|
2359c7c1a9 | ||
|
|
7646974227 | ||
|
|
e3a80e6aa8 | ||
|
|
2c0a008281 | ||
|
|
c5425daa6b |
27
.travis.yml
27
.travis.yml
@@ -7,6 +7,7 @@ language: c
|
||||
jobs:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
@@ -57,7 +58,8 @@ jobs:
|
||||
- TARGET_BOX=LINUX32
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- stage: test
|
||||
- os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
@@ -77,13 +79,14 @@ jobs:
|
||||
# which is slower than container-based infrastructure used for jobs
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
stage: test
|
||||
dist: trusty
|
||||
sudo: true
|
||||
language: minimal
|
||||
before_install:
|
||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
|
||||
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
install:
|
||||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
@@ -120,6 +123,7 @@ jobs:
|
||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: clang
|
||||
addons:
|
||||
@@ -147,6 +151,23 @@ jobs:
|
||||
env:
|
||||
- CMAKE=1
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
stage: test
|
||||
osx_image: xcode8
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-macos
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
only:
|
||||
|
||||
123
CMakeLists.txt
123
CMakeLists.txt
@@ -6,21 +6,30 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 0.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 4.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
include(GNUInstallDirs)
|
||||
|
||||
set(OpenBLAS_LIBNAME openblas)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
endif()
|
||||
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
|
||||
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoids conflicts with other BLAS libraries, especially when using
|
||||
# 64 bit integer interfaces in OpenBLAS.
|
||||
|
||||
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
|
||||
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
|
||||
#######
|
||||
if(BUILD_WITHOUT_LAPACK)
|
||||
set(NO_LAPACK 1)
|
||||
@@ -34,11 +43,13 @@ endif()
|
||||
#######
|
||||
|
||||
|
||||
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||
|
||||
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
|
||||
|
||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
@@ -146,6 +157,7 @@ endif()
|
||||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
@@ -165,6 +177,7 @@ endif()
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
@@ -204,14 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
set(ARCH_IN ${ARCH})
|
||||
endif()
|
||||
|
||||
if (${CORE} STREQUAL "generic")
|
||||
set(ARCH_IN "GENERIC")
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED EXPRECISION)
|
||||
set(EXPRECISION_IN 0)
|
||||
else()
|
||||
set(EXPRECISION_IN ${EXPRECISION})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_CBLAS)
|
||||
set(NO_CBLAS_IN 0)
|
||||
else()
|
||||
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
set(NO_LAPACK_IN 0)
|
||||
else()
|
||||
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACKE)
|
||||
set(NO_LAPACKE_IN 0)
|
||||
else()
|
||||
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NEED2UNDERSCORES)
|
||||
set(NEED2UNDERSCORES_IN 0)
|
||||
else()
|
||||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED ONLY_CBLAS)
|
||||
set(ONLY_CBLAS_IN 0)
|
||||
else()
|
||||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED BU)
|
||||
set(BU _)
|
||||
endif()
|
||||
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Install project
|
||||
|
||||
# Install libraries
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
|
||||
# Install headers
|
||||
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
|
||||
@@ -259,11 +342,31 @@ if(NOT NO_LAPACKE)
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
endif()
|
||||
|
||||
include(FindPkgConfig QUIET)
|
||||
if(PKG_CONFIG_FOUND)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
endif()
|
||||
|
||||
|
||||
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||
set(PN OpenBLAS)
|
||||
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
|
||||
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
||||
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||
VERSION ${${PN}_VERSION}
|
||||
COMPATIBILITY AnyNewerVersion)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
install(EXPORT "${PN}${SUFFIX64}Targets"
|
||||
NAMESPACE "${PN}${SUFFIX64}::"
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
|
||||
|
||||
138
Changelog.txt
138
Changelog.txt
@@ -1,4 +1,142 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.3
|
||||
31-Aug-2018
|
||||
|
||||
common:
|
||||
* thread memory allocation has been switched back to the method
|
||||
used before version 0.3.1 due to unexpected problems caused by
|
||||
the new code under some circumstances. A new compile-time option
|
||||
USE_TLS has been added to enable the new code, and it is hoped
|
||||
that this can become the default again in the next version.
|
||||
* LAPACK PR272 has been integrated, which fixes spurious errors
|
||||
in DSYEVR and related functions caused by missing conversion
|
||||
from ILAENV to ILAENV_2STAGE in several _2stage routines.
|
||||
* the cmake-generated OpenBLASConfig.cmake now uses correct case
|
||||
for the name of the library
|
||||
* added support for Haiku OS
|
||||
|
||||
x86_64:
|
||||
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
|
||||
DSCAL, DGEMVN and DSYMVL
|
||||
* added a workaround for a cygwin issue that prevented compilation
|
||||
of AVX512 code
|
||||
|
||||
IBM Z:
|
||||
* added autodetection of Z14
|
||||
* fixed TRMM errors in the generic target
|
||||
|
||||
====================================================================
|
||||
Version 0.3.2
|
||||
30-Jul-2018
|
||||
|
||||
common:
|
||||
* fixes for regressions caused by the rewrite of the thread
|
||||
initialization code in 0.3.1
|
||||
|
||||
POWER:
|
||||
* fixed cpu autodetection for the BSDs
|
||||
|
||||
MIPS64:
|
||||
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
|
||||
|
||||
x86_64:
|
||||
* added autodetection of AMD Ryzen 2
|
||||
* fixed build with older versions of MSVC
|
||||
|
||||
====================================================================
|
||||
Version 0.3.1
|
||||
01-Jul-2018
|
||||
|
||||
common:
|
||||
* rewritten thread initialization code with significantly reduced overhead
|
||||
* added CBLAS interfaces to the IxAMIN BLAS extension functions
|
||||
* fixed the lapack-test target
|
||||
* CMAKE builds now create an OpenBLASConfig.cmake file
|
||||
* ZAXPY now uses a single thread for small input sizes
|
||||
* the LAPACK code was updated from Reference-LAPACK/lapack#253
|
||||
(fixing LAPACKE interfaces to Aasen's functions)
|
||||
|
||||
POWER:
|
||||
* corrected CROT and ZROT behaviour with zero INC_X
|
||||
|
||||
ARMV7:
|
||||
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||
|
||||
x86_64:
|
||||
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
|
||||
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
|
||||
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
|
||||
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows one to
|
||||
specify the list of x86_64 targets to include. Any target not on the list will be supported
|
||||
by the Sandybridge or Nehalem kernels if available, or by Prescott.
|
||||
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
|
||||
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
|
||||
* added autodetection of Intel Cannon Lake series as Skylake X
|
||||
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
|
||||
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
|
||||
recent mingw from MSYS2
|
||||
* fixed a link error in mixed clang/gfortran builds with OpenMP
|
||||
* updated the OSX deployment target to 10.8
|
||||
* switched on parallel make for builds on MS Windows by default
|
||||
|
||||
x86:
|
||||
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
|
||||
|
||||
====================================================================
|
||||
Version 0.3.0
|
||||
23-May-2018
|
||||
|
||||
common:
|
||||
* fixed some more thread race and locking bugs
|
||||
* added preliminary support for calling an OpenMP build of the library from multiple threads
|
||||
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
|
||||
* general code cleanup
|
||||
* optimized DSDOT implementation
|
||||
* improved thread distribution for GEMM
|
||||
* corrected IMATCOPY/OMATCOPY implementation
|
||||
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
|
||||
* cmake build improvements
|
||||
* pkgconfig file now contains build options
|
||||
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
|
||||
* corrections and improvements for systems with more than 64 cpus
|
||||
* LAPACK code updated to 3.8.0 including later fixes
|
||||
* added ReLAPACK, a recursive implementation of several LAPACK functions
|
||||
* Rewrote ROTMG to handle cases that the netlib code failed to address
|
||||
* Disabled (broken) multithreading code for xTRMV
|
||||
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
|
||||
* shared memory access failures on startup are now handled more gracefully
|
||||
* restored utests from earlier releases (and made them pass on all affected systems)
|
||||
|
||||
SPARC:
|
||||
* several fixes for cpu autodetection
|
||||
|
||||
POWER:
|
||||
* corrected vector register overwriting in several Power8 kernels
|
||||
* optimized additional BLAS functions
|
||||
|
||||
ARM:
|
||||
* added support for CortexA53 and A72
|
||||
* added autodetection for ThunderX2T99
|
||||
* made most optimized kernels the default for generic ARMv8 targets
|
||||
|
||||
x86_64:
|
||||
* parallelized DDOT kernel for Haswell
|
||||
* changed alignment directives in assembly kernels to boost performance on OSX
|
||||
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
|
||||
* added support for building on OpenBSD and Dragonfly
|
||||
* updated compiler options to work with Intel release 2018
|
||||
* support fully optimized build with clang/flang on Microsoft Windows
|
||||
* fixed building on AIX
|
||||
|
||||
IBM Z:
|
||||
* added optimized BLAS 1/2 functions
|
||||
|
||||
MIPS:
|
||||
* fixed cpu autodetection helper code
|
||||
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
|
||||
* added mips64 I6500 cpu
|
||||
|
||||
====================================================================
|
||||
Version 0.2.20
|
||||
24-Jul-2017
|
||||
|
||||
49
Makefile
49
Makefile
@@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
|
||||
RELA = re_lapack
|
||||
endif
|
||||
|
||||
ifeq ($(NO_FORTRAN), 1)
|
||||
define NOFORTRAN
|
||||
1
|
||||
endef
|
||||
define NO_LAPACK
|
||||
1
|
||||
endef
|
||||
export NOFORTRAN
|
||||
export NO_LAPACK
|
||||
endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
@@ -47,7 +58,7 @@ endif
|
||||
endif
|
||||
|
||||
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
|
||||
endif
|
||||
ifneq ($(OSNAME), AIX)
|
||||
@@ -86,16 +97,12 @@ endif
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
@@ -112,7 +119,7 @@ endif
|
||||
endif
|
||||
|
||||
tests :
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
@@ -157,6 +164,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
||||
done
|
||||
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
||||
ifeq ($(DYNAMIC_OLDER), 1)
|
||||
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
|
||||
endif
|
||||
endif
|
||||
ifdef USE_THREAD
|
||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||
@@ -211,7 +221,7 @@ netlib :
|
||||
|
||||
else
|
||||
netlib : lapack_prebuild
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
||||
endif
|
||||
@@ -232,7 +242,7 @@ prof_lapack : lapack_prebuild
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
||||
|
||||
lapack_prebuild :
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@@ -241,7 +251,7 @@ ifndef NOFORTRAN
|
||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@@ -257,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifdef SMP
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else ifeq ($(OSNAME), Haiku)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
@@ -275,21 +287,21 @@ endif
|
||||
endif
|
||||
|
||||
large.tgz :
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
if [ ! -a $< ]; then
|
||||
-wget http://www.netlib.org/lapack/timing/large.tgz;
|
||||
fi
|
||||
endif
|
||||
|
||||
timing.tgz :
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
if [ ! -a $< ]; then
|
||||
-wget http://www.netlib.org/lapack/timing/timing.tgz;
|
||||
fi
|
||||
endif
|
||||
|
||||
lapack-timing : large.tgz timing.tgz
|
||||
ifndef NOFORTRAN
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
|
||||
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
|
||||
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
|
||||
@@ -298,9 +310,10 @@ endif
|
||||
|
||||
lapack-test :
|
||||
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
endif
|
||||
@@ -312,9 +325,9 @@ lapack-runtest:
|
||||
|
||||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||
|
||||
|
||||
dummy :
|
||||
|
||||
@@ -66,18 +66,13 @@ endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@@ -101,8 +96,9 @@ endif
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@@ -115,7 +111,7 @@ endif
|
||||
|
||||
ifndef NO_SHARED
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||
|
||||
@@ -17,6 +17,10 @@ ifdef CPUIDEMU
|
||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), 1004K)
|
||||
TARGET_FLAGS = -mips32r2
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), P5600)
|
||||
TARGET_FLAGS = -mips32r5
|
||||
endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.0.dev
|
||||
VERSION = 0.3.4.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
|
||||
# If you want to support multiple architecture in one binary
|
||||
# DYNAMIC_ARCH = 1
|
||||
|
||||
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
|
||||
# mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON,
|
||||
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
|
||||
# DYNAMIC_OLDER = 1
|
||||
|
||||
# C compiler including binary type(32bit / 64bit). Default is gcc.
|
||||
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
|
||||
# CC = gcc
|
||||
@@ -55,11 +60,26 @@ VERSION = 0.3.0.dev
|
||||
# This flag is always set for POWER8. Don't modify the flag
|
||||
# USE_OPENMP = 1
|
||||
|
||||
# The OpenMP scheduler to use - by default this is "static" and you
|
||||
# will normally not want to change this unless you know that your main
|
||||
# workload will involve tasks that have highly unbalanced running times
|
||||
# for individual threads. Changing away from "static" may also adversely
|
||||
# affect memory access locality in NUMA systems. Setting to "runtime" will
|
||||
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||
|
||||
# You can define maximum number of threads. Basically it should be
|
||||
# less than actual number of cores. If you don't specify one, it's
|
||||
# automatically detected by the script.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
@@ -89,6 +109,12 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
# thread-local storage instead of a central memory buffer in memory.c
|
||||
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
|
||||
# for this to work.
|
||||
# USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compiler supports this. It's safe to keep comment it out if you
|
||||
# are not sure(equivalent to "-i8" option).
|
||||
@@ -100,7 +126,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
#NO_AFFINITY = 1
|
||||
NO_AFFINITY = 1
|
||||
|
||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
@@ -126,6 +152,9 @@ NO_WARMUP = 1
|
||||
# FUNCTION_PROFILE = 1
|
||||
|
||||
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
|
||||
# This option should not be used - it is a holdover from unfinished code present
|
||||
# in the original GotoBLAS2 library that may be usable as a starting point but
|
||||
# is not even expected to compile in its present form.
|
||||
# QUAD_PRECISION = 1
|
||||
|
||||
# Threads are still working for a while after finishing BLAS operation
|
||||
@@ -144,8 +173,11 @@ NO_WARMUP = 1
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
||||
# in small matrix sizes. The default value is 4.
|
||||
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||
# number of floating point operations necessary for the given problem size, no longer
|
||||
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
|
||||
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||
|
||||
# If you need sanity check by comparing reference BLAS. It'll be very
|
||||
|
||||
100
Makefile.system
100
Makefile.system
@@ -9,6 +9,11 @@ ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
endif
|
||||
|
||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
|
||||
# Default C compiler
|
||||
@@ -17,15 +22,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
||||
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
||||
ifeq ($(origin CC),default)
|
||||
|
||||
# Check if $(CC) refers to a valid command and set the value to gcc if not
|
||||
ifneq ($(findstring cmd.exe,$(SHELL)),)
|
||||
ifeq ($(shell where $(CC) 2>NUL),)
|
||||
CC = gcc
|
||||
# Change the default compile to clang on Mac OSX.
|
||||
# http://stackoverflow.com/questions/714100/os-detecting-makefile
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CC = clang
|
||||
# EXTRALIB += -Wl,-no_compact_unwind
|
||||
endif
|
||||
endif
|
||||
else # POSIX-ish
|
||||
ifeq ($(shell command -v $(CC) 2>/dev/null),)
|
||||
ifeq ($(shell uname -s),Darwin)
|
||||
CC = clang
|
||||
# EXTRALIB += -Wl,-no_compact_unwind
|
||||
else
|
||||
CC = gcc
|
||||
endif # Darwin
|
||||
endif # CC exists
|
||||
endif # Shell is sane
|
||||
|
||||
endif # CC is set to default
|
||||
|
||||
# Default Fortran compiler (FC) is selected by f_check.
|
||||
|
||||
@@ -53,6 +67,9 @@ ifeq ($(BINARY), 32)
|
||||
ifeq ($(TARGET), HASWELL)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SKYLAKEX)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
@@ -86,6 +103,9 @@ ifeq ($(BINARY), 32)
|
||||
ifeq ($(TARGET_CORE), HASWELL)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
@@ -132,6 +152,10 @@ ifeq ($(NO_AVX2), 1)
|
||||
GETARCH_FLAGS += -DNO_AVX2
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX512), 1)
|
||||
GETARCH_FLAGS += -DNO_AVX512
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG), 1)
|
||||
GETARCH_FLAGS += -g
|
||||
endif
|
||||
@@ -175,6 +199,10 @@ endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef NUM_PARALLEL
|
||||
NUM_PARALLEL = 1
|
||||
endif
|
||||
|
||||
ifndef NUM_THREADS
|
||||
NUM_THREADS = $(NUM_CORES)
|
||||
endif
|
||||
@@ -225,12 +253,12 @@ endif
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
ifndef MACOSX_DEPLOYMENT_TARGET
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.6
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.8
|
||||
endif
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
@@ -424,7 +452,7 @@ CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), INTEL)
|
||||
CCOMMON_OPT += -openmp
|
||||
CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
@@ -449,13 +477,37 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), x86_64)
|
||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||
DYNAMIC_CORE = PRESCOTT CORE2
|
||||
ifeq ($(DYNAMIC_OLDER), 1)
|
||||
DYNAMIC_CORE += PENRYN DUNNINGTON
|
||||
endif
|
||||
DYNAMIC_CORE += NEHALEM
|
||||
ifeq ($(DYNAMIC_OLDER), 1)
|
||||
DYNAMIC_CORE += OPTERON OPTERON_SSE3
|
||||
endif
|
||||
DYNAMIC_CORE += BARCELONA
|
||||
ifeq ($(DYNAMIC_OLDER), 1)
|
||||
DYNAMIC_CORE += BOBCAT ATOM NANO
|
||||
endif
|
||||
ifneq ($(NO_AVX), 1)
|
||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
||||
endif
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += HASWELL ZEN
|
||||
endif
|
||||
ifneq ($(NO_AVX512), 1)
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += SKYLAKEX
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef DYNAMIC_LIST
|
||||
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
|
||||
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
|
||||
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||
CCOMMON_OPT += $(XCCOMMON_OPT)
|
||||
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
@@ -555,9 +607,14 @@ CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), 1004K)
|
||||
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), P5600)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), I6400)
|
||||
@@ -704,7 +761,7 @@ FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -884,6 +941,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||
CCOMMON_OPT += -DDYNAMIC_ARCH
|
||||
endif
|
||||
|
||||
ifeq ($(DYNAMIC_OLDER), 1)
|
||||
CCOMMON_OPT += -DDYNAMIC_OLDER
|
||||
endif
|
||||
|
||||
ifeq ($(NO_LAPACK), 1)
|
||||
CCOMMON_OPT += -DNO_LAPACK
|
||||
#Disable LAPACK C interface
|
||||
@@ -906,6 +967,10 @@ ifeq ($(NO_AVX2), 1)
|
||||
CCOMMON_OPT += -DNO_AVX2
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX512), 1)
|
||||
CCOMMON_OPT += -DNO_AVX512
|
||||
endif
|
||||
|
||||
ifdef SMP
|
||||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
@@ -952,10 +1017,16 @@ endif
|
||||
|
||||
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
||||
|
||||
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
||||
|
||||
ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
ifndef SYMBOLPREFIX
|
||||
SYMBOLPREFIX =
|
||||
endif
|
||||
@@ -1210,6 +1281,7 @@ export MSA_FLAGS
|
||||
export KERNELDIR
|
||||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
export NO_AVX512
|
||||
|
||||
export SGEMM_UNROLL_M
|
||||
export SGEMM_UNROLL_N
|
||||
|
||||
@@ -8,6 +8,21 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
ARFLAGS = -m x64
|
||||
endif
|
||||
|
||||
232
README.md
232
README.md
@@ -5,175 +5,221 @@
|
||||
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
||||
|
||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
## Binary Packages
|
||||
We provide binary packages for the following platform.
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||
|
||||
## Installation from Source
|
||||
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
||||
|
||||
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
|
||||
### Dependencies
|
||||
|
||||
Building OpenBLAS requires the following to be installed:
|
||||
|
||||
* GNU Make
|
||||
* A C compiler, e.g. GCC or Clang
|
||||
* A Fortran compiler (optional, for LAPACK)
|
||||
* IBM MASS (optional, see below)
|
||||
|
||||
### Normal compile
|
||||
* type "make" to detect the CPU automatically.
|
||||
or
|
||||
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
|
||||
|
||||
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
|
||||
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
|
||||
The full target list is in the file `TargetList.txt`.
|
||||
|
||||
### Cross compile
|
||||
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
|
||||
|
||||
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
|
||||
The target must be specified explicitly when cross compiling.
|
||||
|
||||
Examples:
|
||||
|
||||
On X86 box, compile this library for loongson3a CPU.
|
||||
* On an x86 box, compile this library for a loongson3a CPU:
|
||||
```sh
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
```
|
||||
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
|
||||
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
|
||||
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
|
||||
```sh
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
```
|
||||
|
||||
### Debug version
|
||||
|
||||
make DEBUG=1
|
||||
A debug version can be built using `make DEBUG=1`.
|
||||
|
||||
### Compile with MASS Support on Power CPU (Optional dependency)
|
||||
### Compile with MASS support on Power CPU (optional)
|
||||
|
||||
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
|
||||
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as below -
|
||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
||||
tuned for optimum performance on POWER architectures.
|
||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as shown:
|
||||
|
||||
* On Ubuntu:
|
||||
* On Ubuntu:
|
||||
```sh
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install libxlmass-devel.8.1.5
|
||||
```
|
||||
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
|
||||
sudo apt-get update</br>
|
||||
sudo apt-get install libxlmass-devel.8.1.5</br>
|
||||
* On RHEL/CentOS:
|
||||
```sh
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||
sudo rpm --import repomd.xml.key
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||
sudo yum install libxlmass-devel.8.1.5
|
||||
```
|
||||
|
||||
* On RHEL/CentOS:
|
||||
After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
|
||||
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
|
||||
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
|
||||
sudo rpm --import repomd.xml.key</br>
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
|
||||
sudo yum install libxlmass-devel.8.1.5</br>
|
||||
### Install to a specific directory (optional)
|
||||
|
||||
After installing MASS library, compile openblas with USE_MASS=1.
|
||||
Use `PREFIX=` when invoking `make`, for example
|
||||
|
||||
Example:
|
||||
```sh
|
||||
make install PREFIX=your_installation_directory
|
||||
```
|
||||
|
||||
Compiling on Power8 with MASS support -
|
||||
The default installation directory is `/opt/OpenBLAS`.
|
||||
|
||||
make USE_MASS=1 TARGET=POWER8
|
||||
## Supported CPUs and Operating Systems
|
||||
|
||||
### Install to the directory (optional)
|
||||
Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
Example:
|
||||
### Additional supported CPUs
|
||||
|
||||
make install PREFIX=your_installation_directory
|
||||
#### x86/x86-64
|
||||
|
||||
The default directory is /opt/OpenBLAS
|
||||
|
||||
## Support CPU & OS
|
||||
Please read GotoBLAS_01Readme.txt
|
||||
|
||||
### Additional support CPU:
|
||||
|
||||
#### x86/x86-64:
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
|
||||
#### MIPS64:
|
||||
#### MIPS64
|
||||
|
||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||
- **ICT Loongson 3B**: Experimental
|
||||
|
||||
#### ARM:
|
||||
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
|
||||
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
|
||||
#### ARM
|
||||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
|
||||
- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
|
||||
|
||||
#### ARM64
|
||||
|
||||
- **ARMv8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### PPC/PPC64
|
||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
|
||||
|
||||
### Support OS:
|
||||
### Supported OS
|
||||
|
||||
- **GNU/Linux**
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
|
||||
## Usages
|
||||
Link with libopenblas.a or -lopenblas for shared library.
|
||||
## Usage
|
||||
|
||||
### Set the number of threads with environment variables.
|
||||
Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
|
||||
compiled as a shared library.
|
||||
|
||||
Examples:
|
||||
### Setting the number of threads using environment variables
|
||||
|
||||
export OPENBLAS_NUM_THREADS=4
|
||||
Environment variables are used to specify a maximum number of threads.
|
||||
For example,
|
||||
|
||||
or
|
||||
```sh
|
||||
export OPENBLAS_NUM_THREADS=4
|
||||
export GOTO_NUM_THREADS=4
|
||||
export OMP_NUM_THREADS=4
|
||||
```
|
||||
|
||||
export GOTO_NUM_THREADS=4
|
||||
The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
|
||||
|
||||
or
|
||||
If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
|
||||
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
|
||||
compiled with `USE_OPENMP=1`.
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
### Setting the number of threads at runtime
|
||||
|
||||
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
|
||||
We provide the following functions to control the number of threads at runtime:
|
||||
|
||||
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
|
||||
```c
|
||||
void goto_set_num_threads(int num_threads);
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
```
|
||||
|
||||
### Set the number of threads on runtime.
|
||||
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
|
||||
|
||||
We provided the below functions to control the number of threads on runtime.
|
||||
## Reporting bugs
|
||||
|
||||
void goto_set_num_threads(int num_threads);
|
||||
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
|
||||
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
|
||||
|
||||
## Report Bugs
|
||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
||||
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
|
||||
|
||||
## Contact
|
||||
|
||||
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
||||
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
||||
|
||||
## ChangeLog
|
||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
||||
## Change log
|
||||
|
||||
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
||||
|
||||
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
|
||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
* OpenBLAS does not set processor affinity by default.
|
||||
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
|
||||
Makefile.rule. However, note that this may cause
|
||||
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
|
||||
However, it will be okay when you run the same test case on the shell.
|
||||
|
||||
## Contributing
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
|
||||
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
1. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
|
||||
to start a discussion around a feature idea or a bug.
|
||||
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
3. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
## Donation
|
||||
|
||||
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
|
||||
|
||||
@@ -20,6 +20,7 @@ DUNNINGTON
|
||||
NEHALEM
|
||||
SANDYBRIDGE
|
||||
HASWELL
|
||||
SKYLAKEX
|
||||
ATOM
|
||||
|
||||
b)AMD CPU:
|
||||
@@ -56,6 +57,7 @@ CELL
|
||||
|
||||
3.MIPS CPU:
|
||||
P5600
|
||||
1004K
|
||||
|
||||
4.MIPS64 CPU:
|
||||
SICORTEX
|
||||
|
||||
14
USAGE.md
14
USAGE.md
@@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
Despite its name, and due to the use of memory buffers in functions like SGEMM,
|
||||
the setting of NUM_THREADS can be relevant even for a single-threaded build
|
||||
of OpenBLAS, if such functions get called by multiple threads of a program
|
||||
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
|
||||
a segmentation fault without displaying the above warning first.
|
||||
|
||||
Note that the number of threads used at runtime can be altered to differ from the
|
||||
value NUM_THREADS was set to at build time. At runtime, the actual number of
|
||||
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
|
||||
that this does not change the number of memory buffers that will be allocated,
|
||||
which is set at build time). The number of threads for a process can be set by
|
||||
using the mechanisms described below.
|
||||
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
|
||||
@@ -237,7 +237,7 @@ int main(int argc, char *argv[]){
|
||||
timeg = time1/loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
|
||||
COMPSIZE * COMPSIZE * 2. * (double)k * (double)m * (double)n / timeg * 1.e-6, time1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 0.0};
|
||||
char trans='N';
|
||||
blasint m, i, j;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
|
||||
22
c_check
22
c_check
@@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
|
||||
$os = Linux if ($data =~ /OS_LINUX/);
|
||||
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
|
||||
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
|
||||
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||
$os = AIX if ($data =~ /OS_AIX/);
|
||||
@@ -62,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
|
||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
@@ -199,6 +202,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
# Probe whether the C compiler can assemble an AVX512 instruction.
# $no_avx512 ends up 1 when the test compile fails, 0 otherwise; it is only
# probed on x86 / x86_64 and stays 0 on every other architecture.
# NOTE(review): $tmpf and $compiler_name are set up outside this block;
# $tmpf is presumably a temporary file object/name - confirm it is
# initialised on x86 builds as well.
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
    $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
    print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
    $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
    my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
    # system() returns the wait status (also left in $?); non-zero means the
    # compiler rejected the flag or the instruction.
    $no_avx512 = (system(@cmd) != 0) ? 1 : 0;
    # Remove the object file that the probe actually produced ("$tmpf.o"),
    # not a literal file called "tmpf.o".
    unlink("$tmpf.o");
}
|
||||
|
||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
||||
$data =~ /globl\s([_\.]*)(.*)/;
|
||||
@@ -206,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
|
||||
$need_fu = $1;
|
||||
|
||||
$cross = 0;
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
if ($architecture ne $hostarch) {
|
||||
$cross = 1;
|
||||
@@ -214,6 +231,8 @@ if ($architecture ne $hostarch) {
|
||||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||
}
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
@@ -286,6 +305,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
|
||||
8
cblas.h
8
cblas.h
@@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
|
||||
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
||||
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
||||
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
||||
|
||||
typedef CBLAS_ORDER CBLAS_LAYOUT;
|
||||
|
||||
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
@@ -82,6 +83,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
||||
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
79
cmake/OpenBLASConfig.cmake.in
Normal file
79
cmake/OpenBLASConfig.cmake.in
Normal file
@@ -0,0 +1,79 @@
|
||||
# OpenBLASConfig.cmake
|
||||
# --------------------
|
||||
#
|
||||
# OpenBLAS cmake module.
|
||||
# This module sets the following variables in your project::
|
||||
#
|
||||
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
|
||||
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
|
||||
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
|
||||
# OpenBLAS_INCLUDE_DIR - same as DIRS
|
||||
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
|
||||
# OpenBLAS_LIBRARY - same as LIBRARIES
|
||||
#
|
||||
#
|
||||
# Available components::
|
||||
#
|
||||
## shared - search for only shared library
|
||||
## static - search for only static library
|
||||
# serial - search for unthreaded library
|
||||
# pthread - search for native pthread threaded library
|
||||
# openmp - search for OpenMP threaded library
|
||||
#
|
||||
#
|
||||
# Exported targets::
|
||||
#
|
||||
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
|
||||
## target. Target is shared _or_ static, so, for both, use separate, not
|
||||
## overlapping, installations. ::
|
||||
#
|
||||
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
|
||||
#
|
||||
#
|
||||
# Suggested usage::
|
||||
#
|
||||
# find_package(OpenBLAS)
|
||||
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
|
||||
#
|
||||
#
|
||||
# The following variables can be set to guide the search for this package::
|
||||
#
|
||||
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
|
||||
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
|
||||
# PATH - environment variable, set to bin directory of this package
|
||||
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
|
||||
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set(PN OpenBLAS)
|
||||
|
||||
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
|
||||
if(@USE_OPENMP@)
|
||||
set(${PN}_openmp_FOUND 1)
|
||||
elseif(@USE_THREAD@)
|
||||
set(${PN}_pthread_FOUND 1)
|
||||
else()
|
||||
set(${PN}_serial_FOUND 1)
|
||||
endif()
|
||||
|
||||
check_required_components(${PN})
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Don't include targets if this file is being picked up by another
|
||||
# project which has already built this as a subproject
|
||||
#-----------------------------------------------------------------------------
|
||||
if(NOT TARGET ${PN}::OpenBLAS)
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
|
||||
|
||||
get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
|
||||
set(${PN}_LIBRARY ${_loc})
|
||||
get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
|
||||
set(${PN}_LIBRARIES ${_ill})
|
||||
|
||||
get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
|
||||
set(${PN}_INCLUDE_DIR ${_id})
|
||||
get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
|
||||
set(${PN}_INCLUDE_DIRS ${_iid})
|
||||
endif()
|
||||
|
||||
@@ -49,13 +49,27 @@ if (DYNAMIC_ARCH)
|
||||
endif ()
|
||||
|
||||
if (X86_64)
|
||||
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
||||
set(DYNAMIC_CORE PRESCOTT CORE2)
|
||||
if (DYNAMIC_OLDER)
|
||||
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
|
||||
endif ()
|
||||
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
|
||||
if (DYNAMIC_OLDER)
|
||||
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
|
||||
endif ()
|
||||
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
|
||||
if (DYNAMIC_OLDER)
|
||||
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
|
||||
endif ()
|
||||
if (NOT NO_AVX)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
|
||||
endif ()
|
||||
if (NOT NO_AVX2)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (INTERFACE64)
|
||||
set(SUFFIX64 64)
|
||||
set(SUFFIX64_UNDERSCORE _64)
|
||||
endif()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: -L${libdir} -lopenblas
|
||||
Libs: -L${libdir} -lopenblas${libsuffix}
|
||||
Cflags: -I${includedir}
|
||||
|
||||
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
|
||||
endif ()
|
||||
|
||||
# Cannot run getarch on target if we are cross-compiling
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
|
||||
# Write to config as getarch would
|
||||
|
||||
# TODO: Set up defines that getarch sets up based on every other target
|
||||
|
||||
@@ -33,7 +33,7 @@ endif ()
|
||||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
set(NO_AVX 1)
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
@@ -96,8 +96,12 @@ if (NOT CMAKE_CROSSCOMPILING)
|
||||
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_PARALLEL)
|
||||
set(NUM_PARALLEL 1)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_THREADS)
|
||||
if (NOT NUM_CORES EQUAL 0)
|
||||
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
||||
# HT?
|
||||
set(NUM_THREADS ${NUM_CORES})
|
||||
else ()
|
||||
@@ -159,6 +163,9 @@ endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NO_LAPACK)
|
||||
@@ -207,6 +214,10 @@ if (CONSISTENT_FPCSR)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
||||
endif ()
|
||||
|
||||
if (USE_TLS)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
|
||||
endif ()
|
||||
|
||||
# Only for development
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
||||
@@ -224,6 +235,8 @@ endif ()
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||
|
||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
|
||||
@@ -66,3 +66,12 @@ else()
|
||||
set(BINARY32 1)
|
||||
endif()
|
||||
|
||||
# Probe whether the C compiler accepts -march=skylake-avx512 and can assemble
# an AVX512 instruction. The compiler's exit status is stored in NO_AVX512
# (0 on success); when it equals 1, -DNO_AVX512 is appended to CCOMMON_OPT.
if (X86_64 OR X86)
  file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
  execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
  if (NO_AVX512 EQUAL 1)
    set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
  endif()
  # file(REMOVE) resolves relative paths against the current source directory,
  # so the probe files must be named by the same absolute paths they were
  # written to, otherwise they are left behind in the build tree.
  file(REMOVE "${PROJECT_BINARY_DIR}/avx512.tmp" "${PROJECT_BINARY_DIR}/avx512.o")
endif()
|
||||
|
||||
|
||||
30
common.h
30
common.h
@@ -93,7 +93,7 @@ extern "C" {
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
@@ -105,6 +105,10 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#define NO_SYSV_IPC
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#ifdef ATOM
|
||||
#define GOTO_ATOM ATOM
|
||||
@@ -179,7 +183,7 @@ extern "C" {
|
||||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
@@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
|
||||
|
||||
#ifdef USE64BITINT
|
||||
typedef BLASLONG blasint;
|
||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||
#define blasabs(x) llabs(x)
|
||||
#else
|
||||
#define blasabs(x) labs(x)
|
||||
#endif
|
||||
#else
|
||||
typedef int blasint;
|
||||
#define blasabs(x) abs(x)
|
||||
#endif
|
||||
#else
|
||||
#ifdef USE64BITINT
|
||||
@@ -642,6 +652,7 @@ void gotoblas_profile_init(void);
|
||||
void gotoblas_profile_quit(void);
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
|
||||
#ifndef C_MSVC
|
||||
int omp_in_parallel(void);
|
||||
int omp_get_num_procs(void);
|
||||
@@ -649,6 +660,21 @@ int omp_get_num_procs(void);
|
||||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||
#endif
|
||||
|
||||
#if (__STDC_VERSION__ >= 201112L)
|
||||
#if defined(C_GCC) && ( __GNUC__ < 7)
|
||||
// workaround for GCC bug 65467
|
||||
#ifndef _Atomic
|
||||
#define _Atomic volatile
|
||||
#endif
|
||||
#endif
|
||||
#include <stdatomic.h>
|
||||
#else
|
||||
#ifndef _Atomic
|
||||
#define _Atomic volatile
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#else
|
||||
#ifdef __ELF__
|
||||
int omp_in_parallel (void) __attribute__ ((weak));
|
||||
|
||||
@@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
|
||||
#define RPCC_DEFINED
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
#define WHEREAMI
|
||||
//#define WHEREAMI
|
||||
static inline int WhereAmI(void){
|
||||
int ret=0;
|
||||
__asm__ __volatile__(".set push \n"
|
||||
|
||||
@@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* - large enough to support all architectures and kernel
|
||||
 * Choosing too small a SIZE will lead to stack smashing.
|
||||
*/
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||
/* do not restore all register */ \
|
||||
volatile int stack_alloc_size = SIZE; \
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
|
||||
stack_alloc_size = 0; \
|
||||
STACK_ALLOC_PROTECT_SET \
|
||||
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
|
||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||
/* do not restore all register */ \
|
||||
volatile int stack_alloc_size = SIZE; \
|
||||
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
|
||||
STACK_ALLOC_PROTECT_SET \
|
||||
/* Avoid declaring an array of length 0 */ \
|
||||
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
|
||||
__attribute__((aligned(0x20))); \
|
||||
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
|
||||
10
common_x86.h
10
common_x86.h
@@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
result = x/y;
|
||||
return result;
|
||||
#else
|
||||
|
||||
#if (MAX_CPU_NUMBER > 64)
|
||||
if ( y > 64) {
|
||||
result = x/y;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
@@ -327,7 +333,7 @@ REALNAME:
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
|
||||
#define PROLOGUE \
|
||||
.text; \
|
||||
.align 16; \
|
||||
|
||||
@@ -60,8 +60,13 @@
|
||||
#endif
|
||||
*/
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
#ifdef __GNUC__
|
||||
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#else
|
||||
#define MB do {} while (0)
|
||||
#define WMB do {} while (0)
|
||||
#endif
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
@@ -196,6 +201,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
|
||||
if (y <= 1) return x;
|
||||
|
||||
#if (MAX_CPU_NUMBER > 64)
|
||||
if (y > 64) {
|
||||
result = x / y;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
@@ -403,7 +415,7 @@ REALNAME:
|
||||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
|
||||
#define PROLOGUE \
|
||||
.text; \
|
||||
.align 512; \
|
||||
|
||||
3
cpuid.h
3
cpuid.h
@@ -115,6 +115,7 @@
|
||||
#define CORE_STEAMROLLER 25
|
||||
#define CORE_EXCAVATOR 26
|
||||
#define CORE_ZEN 27
|
||||
#define CORE_SKYLAKEX 28
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
@@ -137,6 +138,7 @@
|
||||
#define HAVE_AVX (1 << 18)
|
||||
#define HAVE_FMA4 (1 << 19)
|
||||
#define HAVE_FMA3 (1 << 20)
|
||||
#define HAVE_AVX512VL (1 << 21)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
@@ -211,5 +213,6 @@ typedef struct {
|
||||
#define CPUTYPE_STEAMROLLER 49
|
||||
#define CPUTYPE_EXCAVATOR 50
|
||||
#define CPUTYPE_ZEN 51
|
||||
#define CPUTYPE_SKYLAKEX 52
|
||||
|
||||
#endif
|
||||
|
||||
@@ -121,7 +121,7 @@ int detect(void)
|
||||
return CPU_VULCAN;
|
||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
|
||||
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX2T99;
|
||||
}
|
||||
|
||||
|
||||
58
cpuid_mips.c
58
cpuid_mips.c
@@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_P5600 1
|
||||
#define CPU_1004K 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"P5600"
|
||||
"P5600",
|
||||
"1004K"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
@@ -90,7 +92,7 @@ int detect(void){
|
||||
if (!strncmp("cpu", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
fprintf(stderr, "%s \n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
@@ -99,43 +101,13 @@ int detect(void){
|
||||
fclose(infile);
|
||||
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("system type", buffer, 11)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
if (strstr(p, "5600")) {
|
||||
return CPU_P5600;
|
||||
} else if (strstr(p, "1004K")) {
|
||||
return CPU_1004K;
|
||||
} else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
@@ -149,7 +121,7 @@ void get_architecture(void){
|
||||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_P5600){
|
||||
if(detect()==CPU_P5600|| detect()==CPU_1004K){
|
||||
printf("P5600");
|
||||
}else{
|
||||
printf("UNKNOWN");
|
||||
@@ -170,6 +142,14 @@ void get_cpuconfig(void){
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
} else if (detect()==CPU_1004K) {
|
||||
printf("#define MIPS1004K\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 26144\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
@@ -178,6 +158,8 @@ void get_cpuconfig(void){
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_P5600) {
|
||||
printf("p5600\n");
|
||||
} else if (detect()==CPU_1004K) {
|
||||
printf("1004K\n");
|
||||
}else{
|
||||
printf("mips\n");
|
||||
}
|
||||
|
||||
@@ -142,6 +142,52 @@ int detect(void){
|
||||
|
||||
return CPUTYPE_PPC970;
|
||||
#endif
|
||||
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
  /*
   * On the BSDs, identify the CPU from the Processor Version Register.
   * The upper 16 bits of the PVR select the CPUTYPE_* value returned.
   *
   * The value is held in an unsigned int so that a PVR with the top bit set
   * (e.g. the 0x8003 case below) is not sign-extended by the right shift.
   */
  unsigned int id;

  /* An asm statement yields no value; the result arrives via the output operand. */
  __asm __volatile("mfpvr %0" : "=r"(id));

  switch ( id >> 16 ) {
  case 0x4e: // POWER9 (reported as POWER8)
    return CPUTYPE_POWER8;
  case 0x4d:
  case 0x4b: // POWER8/8E
    return CPUTYPE_POWER8;
  case 0x4a:
  case 0x3f: // POWER7/7E (reported as POWER6)
    return CPUTYPE_POWER6;
  case 0x3e:
    return CPUTYPE_POWER6;
  case 0x3a:
    return CPUTYPE_POWER5;
  case 0x35:
  case 0x38: // POWER4 /4+
    return CPUTYPE_POWER4;
  case 0x40:
  case 0x41: // POWER3 /3+
    return CPUTYPE_POWER3;
  case 0x39:
  case 0x3c:
  case 0x44:
  case 0x45:
    return CPUTYPE_PPC970;
  case 0x70:
    return CPUTYPE_CELL;
  case 0x8003:
    return CPUTYPE_PPCG4;
  default:
    return CPUTYPE_UNKNOWN;
  }
#endif
|
||||
}
|
||||
|
||||
void get_architecture(void){
|
||||
|
||||
51
cpuid_x86.c
51
cpuid_x86.c
@@ -50,6 +50,8 @@
|
||||
#ifdef NO_AVX
|
||||
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
||||
#define CORE_HASWELL CORE_NEHALEM
|
||||
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
|
||||
#define CORE_SKYLAKEX CORE_NEHALEM
|
||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||
@@ -1299,6 +1301,19 @@ int get_cpuname(void){
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
case 14:
|
||||
// Skylake
|
||||
if(support_avx())
|
||||
@@ -1324,6 +1339,23 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
switch (model) {
|
||||
case 6: // Cannon Lake
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
@@ -1420,6 +1452,8 @@ int get_cpuname(void){
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// AMD Ryzen2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
@@ -1556,6 +1590,7 @@ static char *cpuname[] = {
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
@@ -1610,6 +1645,7 @@ static char *lowercpuname[] = {
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
@@ -1641,6 +1677,7 @@ static char *corename[] = {
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
@@ -1672,6 +1709,7 @@ static char *corename_lower[] = {
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
};
|
||||
|
||||
|
||||
@@ -1860,6 +1898,19 @@ int get_coretype(void){
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
case 14:
|
||||
// Skylake
|
||||
if(support_avx())
|
||||
|
||||
@@ -29,15 +29,18 @@
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13"
|
||||
"Z13",
|
||||
"Z14"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13"
|
||||
"z13",
|
||||
"z14"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
@@ -62,6 +65,10 @@ int detect(void)
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
@@ -107,5 +114,9 @@ void get_cpuconfig(void)
|
||||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
12
ctest.c
12
ctest.c
@@ -60,6 +60,14 @@ OS_FREEBSD
|
||||
OS_NETBSD
|
||||
#endif
|
||||
|
||||
#if defined(__OpenBSD__)
|
||||
OS_OPENBSD
|
||||
#endif
|
||||
|
||||
#if defined(__DragonFly__)
|
||||
OS_DRAGONFLY
|
||||
#endif
|
||||
|
||||
#if defined(__sun)
|
||||
OS_SUNOS
|
||||
#endif
|
||||
@@ -93,6 +101,10 @@ OS_INTERIX
|
||||
OS_LINUX
|
||||
#endif
|
||||
|
||||
#if defined(__HAIKU__)
|
||||
OS_HAIKU
|
||||
#endif
|
||||
|
||||
#if defined(__i386) || defined(_X86)
|
||||
ARCH_X86
|
||||
#endif
|
||||
|
||||
@@ -102,7 +102,13 @@ clean ::
|
||||
rm -f x*
|
||||
|
||||
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
||||
CEXTRALIB =
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB = -lomp
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Single real
|
||||
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
||||
|
||||
@@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
|
||||
@@ -91,7 +91,12 @@
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
||||
@@ -67,7 +67,12 @@
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
||||
@@ -91,7 +91,8 @@
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
volatile
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
@@ -346,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
/* Make sure if no one is using workspace */
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||
@@ -408,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
|
||||
/* Wait until other region of B is initialized */
|
||||
START_RPCC();
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
/* Apply kernel with local region of A and part of other region of B */
|
||||
@@ -426,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
/* Clear synchronization flag if this thread is done with other region of B */
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
@@ -487,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
START_RPCC();
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
STOP_RPCC(waiting3);
|
||||
@@ -653,8 +655,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
}
|
||||
|
||||
/* Clear synchronization flags */
|
||||
for (i = 0; i < MAX_CPU_NUMBER; i++) {
|
||||
for (j = 0; j < MAX_CPU_NUMBER; j++) {
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
for (j = 0; j < nthreads; j++) {
|
||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
@@ -582,7 +582,7 @@ int blas_thread_init(void){
|
||||
if(ret!=0){
|
||||
struct rlimit rlim;
|
||||
const char *msg = strerror(ret);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
|
||||
#ifdef RLIMIT_NPROC
|
||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//#include <sys/mman.h>
|
||||
@@ -47,13 +48,22 @@
|
||||
|
||||
#else
|
||||
|
||||
#ifndef OMP_SCHED
|
||||
#define OMP_SCHED static
|
||||
#endif
|
||||
|
||||
int blas_server_avail = 0;
|
||||
|
||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#else
|
||||
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#endif
|
||||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
@@ -68,15 +78,17 @@ void goto_set_num_threads(int num_threads) {
|
||||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
//adjust buffer for each thread
|
||||
for(i=0; i<blas_cpu_number; i++){
|
||||
if(blas_thread_buffer[i]==NULL){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_cpu_number; j++){
|
||||
if(blas_thread_buffer[i][j]==NULL){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
}
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(ARCH_MIPS64)
|
||||
@@ -92,30 +104,34 @@ void openblas_set_num_threads(int num_threads) {
|
||||
|
||||
int blas_thread_init(void){
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
blas_get_cpu_number();
|
||||
|
||||
blas_server_avail = 1;
|
||||
|
||||
for(i=0; i<blas_num_threads; i++){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_num_threads; j++){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,7 +222,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
}
|
||||
}
|
||||
|
||||
static void exec_threads(blas_queue_t *queue){
|
||||
static void exec_threads(blas_queue_t *queue, int buf_index){
|
||||
|
||||
void *buffer, *sa, *sb;
|
||||
int pos=0, release_flag=0;
|
||||
@@ -223,7 +239,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||
|
||||
pos = omp_get_thread_num();
|
||||
buffer = blas_thread_buffer[pos];
|
||||
buffer = blas_thread_buffer[buf_index][pos];
|
||||
|
||||
//fallback
|
||||
if(buffer==NULL) {
|
||||
@@ -291,7 +307,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
|
||||
BLASLONG i;
|
||||
BLASLONG i, buf_index;
|
||||
|
||||
if ((num <= 0) || (queue == NULL)) return 0;
|
||||
|
||||
@@ -302,16 +318,39 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
while(true) {
|
||||
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Bool inuse = false;
|
||||
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||
#else
|
||||
if(blas_buffer_inuse[i] == false) {
|
||||
blas_buffer_inuse[i] = true;
|
||||
#endif
|
||||
buf_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(i != MAX_PARALLEL_NUMBER)
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(OMP_SCHED)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
queue[i].position = i;
|
||||
#endif
|
||||
|
||||
exec_threads(&queue[i]);
|
||||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||
#else
|
||||
blas_buffer_inuse[buf_index] = false;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,167 @@
|
||||
#define EXTERN
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_LIST
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
|
||||
#ifdef DYN_ATHLON
|
||||
extern gotoblas_t gotoblas_ATHLON;
|
||||
#else
|
||||
#define gotoblas_ATHLON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_KATMAI
|
||||
extern gotoblas_t gotoblas_KATMAI;
|
||||
#else
|
||||
#define gotoblas_KATMAI gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BANIAS
|
||||
extern gotoblas_t gotoblas_BANIAS;
|
||||
#else
|
||||
#define gotoblas_BANIAS gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_COPPERMINE
|
||||
extern gotoblas_t gotoblas_COPPERMINE;
|
||||
#else
|
||||
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NORTHWOOD
|
||||
extern gotoblas_t gotoblas_NORTHWOOD;
|
||||
#else
|
||||
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_CORE2
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
#else
|
||||
#define gotoblas_CORE2 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NEHALEM
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
#else
|
||||
#define gotoblas_NEHALEM gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BARCELONA
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BARCELONA gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BARCELONA gotoblas_PRESCOTT
|
||||
#endif
|
||||
/*
 * Kernel table used for Atom cores in a DYNAMIC_LIST build.
 * Uses the dedicated ATOM table when DYN_ATOM is defined; otherwise
 * aliases NEHALEM if that target was built, and PRESCOTT as the last resort.
 * The middle branch must be a real "#elif" directive: without the leading
 * '#' it is compiled as ordinary C tokens when DYN_ATOM is defined, and the
 * NEHALEM alias is overridden by the PRESCOTT one when it is not.
 */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
|
||||
#ifdef DYN_NANO
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
#else
|
||||
#define gotoblas_NANO gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PENRYN
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
#else
|
||||
#define gotoblas_PENRYN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_DUNNINGTON
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
#else
|
||||
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
#else
|
||||
#define gotoblas_OPTERON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON_SSE3
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
#else
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BOBCAT
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BOBCAT gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BOBCAT gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SANDYBRIDGE
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BULLDOZER
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BULLDOZER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PILEDRIVER
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_STEAMROLLER
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_EXCAVATOR
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_HASWELL
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_HASWELL gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_ZEN
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_ZEN gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_ZEN gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_ZEN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SKYLAKEX
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
|
||||
#endif
|
||||
|
||||
|
||||
#else // not DYNAMIC_LIST
|
||||
EXTERN gotoblas_t gotoblas_KATMAI;
|
||||
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
||||
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
||||
@@ -56,16 +217,27 @@ EXTERN gotoblas_t gotoblas_BANIAS;
|
||||
EXTERN gotoblas_t gotoblas_ATHLON;
|
||||
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#ifdef DYNAMIC_OLDER
|
||||
extern gotoblas_t gotoblas_ATOM;
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#else
|
||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||
#define gotoblas_NANO gotoblas_NEHALEM
|
||||
#define gotoblas_PENRYN gotoblas_CORE2
|
||||
#define gotoblas_DUNNINGTON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
|
||||
#define gotoblas_BOBCAT gotoblas_CORE2
|
||||
#endif
|
||||
|
||||
#ifndef NO_AVX
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
@@ -74,15 +246,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#ifndef NO_AVX512
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
@@ -90,6 +269,7 @@ extern gotoblas_t gotoblas_ZEN;
|
||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
#endif // DYNAMIC_LIST
|
||||
|
||||
#define VENDOR_INTEL 1
|
||||
#define VENDOR_AMD 2
|
||||
@@ -284,8 +464,21 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
return &gotoblas_HASWELL;
|
||||
else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14 || model == 5) {
|
||||
if (model == 14) {
|
||||
if(support_avx())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
@@ -307,6 +500,23 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 6:
|
||||
if (model == 6) {
|
||||
// Cannon Lake
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return &gotoblas_HASWELL;
|
||||
#else
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return &gotoblas_NEHALEM;
|
||||
#endif
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake
|
||||
@@ -397,7 +607,7 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1) {
|
||||
if (model == 1 || model == 8) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
@@ -445,7 +655,8 @@ static char *corename[] = {
|
||||
"Haswell",
|
||||
"Steamroller",
|
||||
"Excavator",
|
||||
"Zen"
|
||||
"Zen",
|
||||
"SkylakeX"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
@@ -473,7 +684,7 @@ char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
|
||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
@@ -485,7 +696,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 23; i++)
|
||||
for ( i=1 ; i <= 24; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
@@ -503,6 +714,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 24: return (&gotoblas_SKYLAKEX);
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
#if _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static char* openblas_config_str=""
|
||||
#ifdef USE64BITINT
|
||||
"USE64BITINT "
|
||||
@@ -54,6 +60,9 @@ static char* openblas_config_str=""
|
||||
#ifdef NO_AFFINITY
|
||||
"NO_AFFINITY "
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
"USE_OPENMP "
|
||||
#endif
|
||||
#ifndef DYNAMIC_ARCH
|
||||
CHAR_CORENAME
|
||||
#endif
|
||||
@@ -61,18 +70,23 @@ static char* openblas_config_str=""
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
char *gotoblas_corename();
|
||||
static char tmp_config_str[256];
|
||||
#endif
|
||||
|
||||
static char tmp_config_str[256];
|
||||
int openblas_get_parallel();
|
||||
|
||||
char* CNAME() {
|
||||
#ifndef DYNAMIC_ARCH
|
||||
return openblas_config_str;
|
||||
#else
|
||||
char tmpstr[20];
|
||||
strcpy(tmp_config_str, openblas_config_str);
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
return tmp_config_str;
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
|
||||
@@ -83,3 +97,4 @@ char* openblas_get_corename() {
|
||||
return gotoblas_corename();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -167,7 +167,7 @@ int get_L2_size(void){
|
||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
@@ -251,7 +251,7 @@ int get_L2_size(void){
|
||||
void blas_set_parameter(void){
|
||||
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
||||
@@ -114,20 +114,22 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
INTERNALNAME = $(LIBPREFIX).so
|
||||
FEXTRALIB += -lm
|
||||
EXTRALIB += -lm
|
||||
else
|
||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
@@ -156,7 +158,7 @@ endif
|
||||
endif
|
||||
|
||||
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
||||
6
f_check
6
f_check
@@ -97,7 +97,7 @@ if ($compiler eq "") {
|
||||
|
||||
if ($data =~ /Intel/) {
|
||||
$vendor = INTEL;
|
||||
$openmp = "-openmp";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Sun Fortran/) {
|
||||
@@ -127,7 +127,7 @@ if ($compiler eq "") {
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data =~ /zho_ge__/) {
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
}
|
||||
@@ -155,7 +155,7 @@ if ($compiler eq "") {
|
||||
if ($compiler =~ /ifort/) {
|
||||
$vendor = INTEL;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pathf/) {
|
||||
|
||||
23
getarch.c
23
getarch.c
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "HASWELL"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_SKYLAKEX
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "SKYLAKEX"
|
||||
#define ARCHCONFIG "-DSKYLAKEX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
|
||||
#define LIBNAME "skylakex"
|
||||
#define CORENAME "SKYLAKEX"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
@@ -1074,7 +1089,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1088,7 +1103,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
@@ -1181,9 +1196,7 @@ int main(int argc, char *argv[]){
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
#else
|
||||
#ifndef OS_WINDOWS
|
||||
printf("MAKE += -j %d\n", get_num_cores());
|
||||
#endif
|
||||
#endif
|
||||
|
||||
break;
|
||||
|
||||
@@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
|
||||
idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX)
|
||||
|
||||
CSBLAS1OBJS = \
|
||||
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||
cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
|
||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
|
||||
@@ -277,7 +277,7 @@ CSBLAS3OBJS = \
|
||||
cblas_sgeadd.$(SUFFIX)
|
||||
|
||||
CDBLAS1OBJS = \
|
||||
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
|
||||
@@ -294,7 +294,7 @@ CDBLAS3OBJS += \
|
||||
cblas_dgeadd.$(SUFFIX)
|
||||
|
||||
CCBLAS1OBJS = \
|
||||
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||
cblas_ccopy.$(SUFFIX) \
|
||||
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
|
||||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||
@@ -320,7 +320,7 @@ CCBLAS3OBJS = \
|
||||
|
||||
|
||||
CZBLAS1OBJS = \
|
||||
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||
cblas_zcopy.$(SUFFIX) \
|
||||
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||
@@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c
|
||||
cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
|
||||
@@ -40,11 +40,11 @@
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
#endif
|
||||
#if defined(Z13)
|
||||
#define MULTI_THREAD_MINIMAL 200000
|
||||
#else
|
||||
#define MULTI_THREAD_MINIMAL 10000
|
||||
#define MULTI_THREAD_MINIMAL 10000
|
||||
#endif
|
||||
#ifndef CBLAS
|
||||
|
||||
@@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||
if (incy < 0) y -= (n - 1) * incy;
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0)
|
||||
nthreads = 1;
|
||||
|
||||
//
|
||||
//Temporarily work-around the low performance issue with small input size &
|
||||
//multithreads.
|
||||
if (n <= MULTI_THREAD_MINIMAL)
|
||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#define SMP_THRESHOLD_MIN 65536.0
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGEMM "
|
||||
#elif defined(DOUBLE)
|
||||
@@ -52,6 +53,7 @@
|
||||
#define ERROR_NAME "SGEMM "
|
||||
#endif
|
||||
#else
|
||||
#define SMP_THRESHOLD_MIN 8192.0
|
||||
#ifndef GEMM3M
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGEMM "
|
||||
@@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
@@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
XFLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
@@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
|
||||
nthreads_max = num_cpu_avail(3);
|
||||
nthreads_avail = nthreads_max;
|
||||
|
||||
#ifndef COMPLEX
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
nthreads_max = 1;
|
||||
#else
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
nthreads_max = 1;
|
||||
#endif
|
||||
args.common = NULL;
|
||||
|
||||
if ( nthreads_max > nthreads_avail )
|
||||
args.nthreads = nthreads_avail;
|
||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = nthreads_max;
|
||||
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
args.common = NULL;
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
|
||||
|
||||
blas_level1_thread(mode, n, k1, k2, dummyalpha,
|
||||
a, lda, NULL, 0, ipiv, incx,
|
||||
laswp[flag], nthreads);
|
||||
(int(*)())laswp[flag], nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
|
||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
|
||||
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads);
|
||||
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
long double s;
|
||||
long double r, roe, z;
|
||||
|
||||
long double ada = fabs(da);
|
||||
long double adb = fabs(db);
|
||||
long double ada = fabsl(da);
|
||||
long double adb = fabsl(db);
|
||||
long double scale = ada + adb;
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
@@ -64,6 +64,13 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
|
||||
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
|
||||
|
||||
if (*dd2 == ZERO || dy1 == ZERO)
|
||||
{
|
||||
dflag = -TWO;
|
||||
dparam[0] = dflag;
|
||||
return;
|
||||
}
|
||||
|
||||
if(*dd1 < ZERO)
|
||||
{
|
||||
dflag = -ONE;
|
||||
@@ -76,6 +83,16 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
*dd2 = ZERO;
|
||||
*dx1 = ZERO;
|
||||
}
|
||||
else if ((*dd1 == ZERO || *dx1 == ZERO) && *dd2 > ZERO)
|
||||
{
|
||||
dflag = ONE;
|
||||
dh12 = 1;
|
||||
dh21 = -1;
|
||||
*dx1 = dy1;
|
||||
dtemp = *dd1;
|
||||
*dd1 = *dd2;
|
||||
*dd2 = dtemp;
|
||||
}
|
||||
else
|
||||
{
|
||||
dp2 = *dd2 * dy1;
|
||||
@@ -90,6 +107,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
dq1 = dp1 * *dx1;
|
||||
if(ABS(dq1) > ABS(dq2))
|
||||
{
|
||||
dflag = ZERO;
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dh21 = - dy1 / *dx1;
|
||||
dh12 = dp2 / dp1;
|
||||
|
||||
@@ -100,8 +120,19 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
*dd1 = *dd1 / du;
|
||||
*dd2 = *dd2 / du;
|
||||
*dx1 = *dx1 * du;
|
||||
} else {
|
||||
dflag = -ONE;
|
||||
|
||||
dh11 = ZERO;
|
||||
dh12 = ZERO;
|
||||
dh21 = ZERO;
|
||||
dh22 = ZERO;
|
||||
|
||||
*dd1 = ZERO;
|
||||
*dd2 = ZERO;
|
||||
*dx1 = ZERO;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -120,7 +151,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
}
|
||||
else
|
||||
{
|
||||
dflag = ONE;
|
||||
dflag = ONE;
|
||||
dh21 = -ONE;
|
||||
dh12 = ONE;
|
||||
|
||||
dh11 = dp1 / dp2;
|
||||
dh22 = *dx1 / dy1;
|
||||
@@ -134,76 +167,33 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
}
|
||||
|
||||
|
||||
if(*dd1 != ZERO)
|
||||
while ( *dd1 <= RGAMSQ && *dd1 != ZERO)
|
||||
{
|
||||
if( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) )
|
||||
{
|
||||
if(dflag == ZERO)
|
||||
{
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
else
|
||||
{
|
||||
dh21 = -ONE;
|
||||
dh12 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
if( *dd1 <= RGAMSQ )
|
||||
{
|
||||
while (ABS(*dd1) <= RGAMSQ) {
|
||||
*dd1 = *dd1 * (GAM * GAM);
|
||||
*dx1 = *dx1 / GAM;
|
||||
dh11 = dh11 / GAM;
|
||||
dh12 = dh12 / GAM;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (ABS(*dd1) >= GAMSQ) {
|
||||
*dd1 = *dd1 / (GAM * GAM);
|
||||
*dx1 = *dx1 * GAM;
|
||||
dh11 = dh11 * GAM;
|
||||
dh12 = dh12 * GAM;
|
||||
}
|
||||
}
|
||||
}
|
||||
dflag = -ONE;
|
||||
*dd1 = *dd1 * (GAM * GAM);
|
||||
*dx1 = *dx1 / GAM;
|
||||
dh11 = dh11 / GAM;
|
||||
dh12 = dh12 / GAM;
|
||||
}
|
||||
while (ABS(*dd1) > GAMSQ) {
|
||||
dflag = -ONE;
|
||||
*dd1 = *dd1 / (GAM * GAM);
|
||||
*dx1 = *dx1 * GAM;
|
||||
dh11 = dh11 * GAM;
|
||||
dh12 = dh12 * GAM;
|
||||
}
|
||||
|
||||
if(*dd2 != ZERO)
|
||||
{
|
||||
if( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) )
|
||||
{
|
||||
if(dflag == ZERO)
|
||||
{
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
else
|
||||
{
|
||||
dh21 = -ONE;
|
||||
dh12 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
if( ABS(*dd2) <= RGAMSQ )
|
||||
{
|
||||
while (ABS(*dd2) <= RGAMSQ) {
|
||||
*dd2 = *dd2 * (GAM * GAM);
|
||||
dh21 = dh21 / GAM;
|
||||
dh22 = dh22 / GAM;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (ABS(*dd2) >= GAMSQ) {
|
||||
*dd2 = *dd2 / (GAM * GAM);
|
||||
dh21 = dh21 * GAM;
|
||||
dh22 = dh22 * GAM;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (ABS(*dd2) <= RGAMSQ && *dd2 != ZERO) {
|
||||
dflag = -ONE;
|
||||
*dd2 = *dd2 * (GAM * GAM);
|
||||
dh21 = dh21 / GAM;
|
||||
dh22 = dh22 / GAM;
|
||||
}
|
||||
while (ABS(*dd2) > GAMSQ) {
|
||||
dflag = -ONE;
|
||||
*dd2 = *dd2 / (GAM * GAM);
|
||||
dh21 = dh21 * GAM;
|
||||
dh22 = dh22 * GAM;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
|
||||
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (n <= 1048576 )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
mode |= (trans << BLAS_TRANSA_SHIFT);
|
||||
mode |= (side << BLAS_RSIDE_SHIFT);
|
||||
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -41,7 +41,11 @@
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(Z13)
|
||||
#define MULTI_THREAD_MINIMAL 200000
|
||||
#else
|
||||
#define MULTI_THREAD_MINIMAL 10000
|
||||
#endif
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
|
||||
@@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
PRINT_DEBUG_CNAME;
|
||||
PRINT_DEBUG_NAME;
|
||||
#else
|
||||
PRINT_DEBUG_CNAME;
|
||||
#endif
|
||||
@@ -86,12 +90,15 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0)
|
||||
//
|
||||
//Temporarily work-around the low performance issue with small input size &
|
||||
//multithreads.
|
||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
||||
@@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
long double db_i = *(DB + 1);
|
||||
long double r;
|
||||
|
||||
long double ada = fabs(da_r) + fabs(da_i);
|
||||
long double ada = fabsl(da_r) + fabsl(da_i);
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if ( n <= 1048576 )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy;
|
||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
} else
|
||||
nthreads = 1;
|
||||
|
||||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
|
||||
@@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
|
||||
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
|
||||
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -29,9 +29,11 @@ USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
@@ -42,7 +44,7 @@ ifeq ($(CORE), POWER8)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ SDOTKERNEL = ../arm/dot.c
|
||||
DDOTKERNEL = ../arm/dot.c
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
|
||||
@@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vabs.f64 d6, d6
|
||||
vadd.f64 d1 , d1, d5
|
||||
vabs.f64 d7, d7
|
||||
@@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
@@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
@@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
@@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vabs.f32 s6, s6
|
||||
vadd.f32 s1 , s1, s5
|
||||
vabs.f32 s7, s7
|
||||
@@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
@@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
@@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
@@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vabs.f64 d6, d6
|
||||
vadd.f64 d1 , d1, d5
|
||||
vabs.f64 d7, d7
|
||||
@@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vabs.f64 d6, d6
|
||||
vadd.f64 d1 , d1, d5
|
||||
vabs.f64 d7, d7
|
||||
@@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
@@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
@@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
@@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vabs.f32 s6, s6
|
||||
vadd.f32 s1 , s1, s5
|
||||
vabs.f32 s7, s7
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vabs.f32 s6, s6
|
||||
vadd.f32 s1 , s1, s5
|
||||
vabs.f32 s7, s7
|
||||
@@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
@@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
@@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
|
||||
@@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d7 }
|
||||
vldmia.f64 X!, { d4 - d7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y , { d8 - d11 }
|
||||
vldmia.f64 Y , { d8 - d11 }
|
||||
fmacd d8 , d0, d4
|
||||
fstmiad Y!, { d8 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
fmacd d9 , d0, d5
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
fmacd d10, d0, d6
|
||||
fstmiad Y!, { d10 }
|
||||
vstmia.f64 Y!, { d10 }
|
||||
fmacd d11, d0, d7
|
||||
fstmiad Y!, { d11 }
|
||||
vstmia.f64 Y!, { d11 }
|
||||
|
||||
|
||||
.endm
|
||||
@@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
fldmiad Y , { d8 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vldmia.f64 Y , { d8 }
|
||||
fmacd d8 , d0, d4
|
||||
fstmiad Y!, { d8 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X , { d4 }
|
||||
fldmiad Y , { d8 }
|
||||
vldmia.f64 X , { d4 }
|
||||
vldmia.f64 Y , { d8 }
|
||||
fmacd d8 , d0, d4
|
||||
fstmiad Y , { d8 }
|
||||
vstmia.f64 Y , { d8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X!, { s4 - s7 }
|
||||
fldmias Y , { s8 - s11 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
vldmia.f32 Y , { s8 - s11 }
|
||||
fmacs s8 , s0, s4
|
||||
fstmias Y!, { s8 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
fmacs s9 , s0, s5
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
fmacs s10, s0, s6
|
||||
fstmias Y!, { s10 }
|
||||
vstmia.f32 Y!, { s10 }
|
||||
fmacs s11, s0, s7
|
||||
fstmias Y!, { s11 }
|
||||
vstmia.f32 Y!, { s11 }
|
||||
|
||||
|
||||
.endm
|
||||
@@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
fldmias Y , { s8 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vldmia.f32 Y , { s8 }
|
||||
fmacs s8 , s0, s4
|
||||
fstmias Y!, { s8 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X , { s4 }
|
||||
fldmias Y , { s8 }
|
||||
vldmia.f32 X , { s4 }
|
||||
vldmia.f32 Y , { s8 }
|
||||
fmacs s8 , s0, s4
|
||||
fstmias Y , { s8 }
|
||||
vstmia.f32 Y , { s8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d7 }
|
||||
vldmia.f64 X!, { d4 - d7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y , { d8 - d11 }
|
||||
vldmia.f64 Y , { d8 - d11 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y!, { d8 }
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
|
||||
FMAC_R1 d10, d0, d6
|
||||
FMAC_R2 d10, d1, d7
|
||||
FMAC_I1 d11, d0, d7
|
||||
FMAC_I2 d11, d1, d6
|
||||
fstmiad Y!, { d10 }
|
||||
fstmiad Y!, { d11 }
|
||||
vstmia.f64 Y!, { d10 }
|
||||
vstmia.f64 Y!, { d11 }
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d7 }
|
||||
vldmia.f64 X!, { d4 - d7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y , { d8 - d11 }
|
||||
vldmia.f64 Y , { d8 - d11 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y!, { d8 }
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
|
||||
FMAC_R1 d10, d0, d6
|
||||
FMAC_R2 d10, d1, d7
|
||||
FMAC_I1 d11, d0, d7
|
||||
FMAC_I2 d11, d1, d6
|
||||
fstmiad Y!, { d10 }
|
||||
fstmiad Y!, { d11 }
|
||||
vstmia.f64 Y!, { d10 }
|
||||
vstmia.f64 Y!, { d11 }
|
||||
|
||||
|
||||
|
||||
@@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
fldmiad Y , { d8 - d9 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vldmia.f64 Y , { d8 - d9 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y!, { d8 }
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
|
||||
|
||||
|
||||
@@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X , { d4 - d5 }
|
||||
fldmiad Y , { d8 - d9 }
|
||||
vldmia.f64 X , { d4 - d5 }
|
||||
vldmia.f64 Y , { d8 - d9 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y , { d8 - d9 }
|
||||
vstmia.f64 Y , { d8 - d9 }
|
||||
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
@@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s4 - s7 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmias Y , { s8 - s11 }
|
||||
vldmia.f32 Y , { s8 - s11 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y!, { s8 }
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
|
||||
FMAC_R1 s10, s0, s6
|
||||
FMAC_R2 s10, s1, s7
|
||||
FMAC_I1 s11, s0, s7
|
||||
FMAC_I2 s11, s1, s6
|
||||
fstmias Y!, { s10 }
|
||||
fstmias Y!, { s11 }
|
||||
vstmia.f32 Y!, { s10 }
|
||||
vstmia.f32 Y!, { s11 }
|
||||
|
||||
fldmias X!, { s4 - s7 }
|
||||
fldmias Y , { s8 - s11 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
vldmia.f32 Y , { s8 - s11 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y!, { s8 }
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
|
||||
FMAC_R1 s10, s0, s6
|
||||
FMAC_R2 s10, s1, s7
|
||||
FMAC_I1 s11, s0, s7
|
||||
FMAC_I2 s11, s1, s6
|
||||
fstmias Y!, { s10 }
|
||||
fstmias Y!, { s11 }
|
||||
vstmia.f32 Y!, { s10 }
|
||||
vstmia.f32 Y!, { s11 }
|
||||
|
||||
|
||||
|
||||
@@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y , { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y , { s8 - s9 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y!, { s8 }
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
|
||||
|
||||
|
||||
@@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X , { s4 - s5 }
|
||||
fldmias Y , { s8 - s9 }
|
||||
vldmia.f32 X , { s4 - s5 }
|
||||
vldmia.f32 Y , { s8 - s9 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y , { s8 - s9 }
|
||||
vstmia.f32 Y , { s8 - s9 }
|
||||
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
@@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cmp N, #0
|
||||
ble axpy_kernel_L999
|
||||
|
||||
/*
|
||||
cmp INC_X, #0
|
||||
beq axpy_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq axpy_kernel_L999
|
||||
|
||||
*/
|
||||
cmp INC_X, #1
|
||||
bne axpy_kernel_S_BEGIN
|
||||
|
||||
|
||||
@@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s0 - s7 }
|
||||
fstmias Y!, { s0 - s7 }
|
||||
vldmia.f32 X!, { s0 - s7 }
|
||||
vstmia.f32 Y!, { s0 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY_F1
|
||||
|
||||
fldmias X!, { s0 - s1 }
|
||||
fstmias Y!, { s0 - s1 }
|
||||
vldmia.f32 X!, { s0 - s1 }
|
||||
vstmia.f32 Y!, { s0 - s1 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_S4
|
||||
|
||||
nop
|
||||
fldmias X, { s0 - s1 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s2 - s3 }
|
||||
fstmias Y, { s2 - s3 }
|
||||
vldmia.f32 X, { s2 - s3 }
|
||||
vstmia.f32 Y, { s2 - s3 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s0 - s1 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s2 - s3 }
|
||||
fstmias Y, { s2 - s3 }
|
||||
vldmia.f32 X, { s2 - s3 }
|
||||
vstmia.f32 Y, { s2 - s3 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY_S1
|
||||
|
||||
fldmias X, { s0 - s1 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y!, { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y!, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
fmacs s2 , s5, s9
|
||||
fmacs s3 , s5, s8
|
||||
|
||||
fldmias Y!, { s10 - s11 }
|
||||
vldmia.f32 Y!, { s10 - s11 }
|
||||
fmacs s0 , s6, s10
|
||||
fmacs s1 , s6, s11
|
||||
fmacs s2 , s7, s11
|
||||
fmacs s3 , s7, s10
|
||||
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y!, { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y!, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
fmacs s2 , s5, s9
|
||||
fmacs s3 , s5, s8
|
||||
|
||||
fldmias Y!, { s10 - s11 }
|
||||
vldmia.f32 Y!, { s10 - s11 }
|
||||
fmacs s0 , s6, s10
|
||||
fmacs s1 , s6, s11
|
||||
fmacs s2 , s7, s11
|
||||
@@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y!, { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y!, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
nop
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp N, #0
|
||||
ble cdot_kernel_L999
|
||||
|
||||
cmp INC_X, #0
|
||||
beq cdot_kernel_L999
|
||||
# cmp INC_X, #0
|
||||
# beq cdot_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq cdot_kernel_L999
|
||||
# cmp INC_Y, #0
|
||||
# beq cdot_kernel_L999
|
||||
|
||||
cmp INC_X, #1
|
||||
bne cdot_kernel_S_BEGIN
|
||||
|
||||
@@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
|
||||
fmuls s8 , s0, s4
|
||||
@@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_M2
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_E
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
fldmias CO2, { s4 - s7 }
|
||||
vldmia.f32 CO2, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias CO2, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
fldmias CO2, { s4 - s5 }
|
||||
vldmia.f32 CO2, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias CO2, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fmuls s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmuls s17 , s0, s9
|
||||
fmuls s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmuls s18 , s2, s8
|
||||
fmuls s26 , s3, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmuls s19 , s2, s9
|
||||
fmuls s27 , s3, s8
|
||||
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmuls s20 , s0, s10
|
||||
fmuls s28 , s1, s11
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmuls s21 , s0, s11
|
||||
fmuls s29 , s1, s10
|
||||
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmuls s22 , s2, s10
|
||||
fmuls s30 , s3, s11
|
||||
fmuls s23 , s2, s11
|
||||
@@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmacs s24 , s1, s9
|
||||
fmacs s17 , s0, s9
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fmacs s18 , s2, s8
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmacs s27 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s10
|
||||
@@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s24 , s5, s13
|
||||
fmacs s17 , s4, s13
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
fmacs s25 , s5, s12
|
||||
|
||||
fmacs s18 , s6, s12
|
||||
fmacs s26 , s7, s13
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
fmacs s19 , s6, s13
|
||||
fmacs s27 , s7, s12
|
||||
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s20 , s4, s14
|
||||
fmacs s28 , s5, s15
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s21 , s4, s15
|
||||
fmacs s29 , s5, s14
|
||||
|
||||
@@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fmacs s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s17 , s0, s9
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s18 , s2, s8
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
@@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
fldmias CO2, { s8 - s11 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
vldmia.f32 CO2, { s8 - s11 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s10, s1 , s23
|
||||
FMAC_I2 s11, s1 , s22
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
fstmias CO2, { s8 - s11 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s8 - s11 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
fldmias CO2, { s8 - s9 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
vldmia.f32 CO2, { s8 - s9 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s8 , s1 , s21
|
||||
FMAC_I2 s9 , s1 , s20
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
fstmias CO2, { s8 - s9 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s8 - s9 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s19
|
||||
FMAC_I2 s7 , s1 , s18
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s17
|
||||
FMAC_I2 s5 , s1 , s16
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s6 , [ AO2, #8 ]
|
||||
flds s7 , [ AO2, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s7 }
|
||||
vstmia.f32 BO!, { s0 - s7 }
|
||||
add AO2, AO2, #16
|
||||
|
||||
.endm
|
||||
@@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s3 , [ AO2, #4 ]
|
||||
|
||||
add AO1, AO1, #8
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s2 , [ AO1, #8 ]
|
||||
flds s3 , [ AO1, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO1, AO1, #16
|
||||
|
||||
.endm
|
||||
@@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0 , [ AO1, #0 ]
|
||||
flds s1 , [ AO1, #4 ]
|
||||
|
||||
fstmias BO!, { s0 - s1 }
|
||||
vstmia.f32 BO!, { s0 - s1 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
|
||||
@@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**************************************************************************************/
|
||||
.macro COPY2x2
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s4 - s7 }
|
||||
vldmia.f32 r3, { s4 - s7 }
|
||||
|
||||
fstmias BO1, { s0 - s7 }
|
||||
vstmia.f32 BO1, { s0 - s7 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x2
|
||||
|
||||
fldmias AO1, { s0 -s1 }
|
||||
vldmia.f32 AO1, { s0 -s1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s2 - s3 }
|
||||
vldmia.f32 r3, { s2 - s3 }
|
||||
|
||||
fstmias BO2, { s0 - s3 }
|
||||
vstmia.f32 BO2, { s0 - s3 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #16
|
||||
|
||||
@@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*************************************************************************************************************************/
|
||||
.macro COPY2x1
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
fstmias BO1, { s0 - s3 }
|
||||
vstmia.f32 BO1, { s0 - s3 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x1
|
||||
|
||||
fldmias AO1, { s0 - s1 }
|
||||
vldmia.f32 AO1, { s0 - s1 }
|
||||
|
||||
fstmias BO2, { s0 - s1 }
|
||||
vstmia.f32 BO2, { s0 - s1 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #8
|
||||
|
||||
|
||||
@@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, #8
|
||||
|
||||
@@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s6 - s7 }
|
||||
vldmia.f32 YO, { s6 - s7 }
|
||||
|
||||
FMAC_R1 s6 , s0 , s10
|
||||
FMAC_I1 s7 , s0 , s11
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias YO, { s6 - s7 }
|
||||
vstmia.f32 YO, { s6 - s7 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s6 - s7 }
|
||||
vldmia.f32 YO, { s6 - s7 }
|
||||
|
||||
FMAC_R1 s6 , s0 , s14
|
||||
FMAC_I1 s7 , s0 , s15
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO, { s6 - s7 }
|
||||
vstmia.f32 YO, { s6 - s7 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
@@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
|
||||
@@ -150,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmias XO! , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
fldmias AO2!, { s8 - s9 }
|
||||
vldmia.f32 XO! , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
vldmia.f32 AO2!, { s8 - s9 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -180,7 +180,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -204,8 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
vldmia.f32 XO! , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -216,14 +216,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO!, { s4 - s5 }
|
||||
vstmia.f32 YO!, { s4 - s5 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -249,9 +249,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmias XO , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
fldmias AO2!, { s8 - s9 }
|
||||
vldmia.f32 XO , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
vldmia.f32 AO2!, { s8 - s9 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -269,25 +269,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s6 - s7 }
|
||||
vldmia.f32 YO, { s6 - s7 }
|
||||
|
||||
FMAC_R1 s6 , s0 , s14
|
||||
FMAC_I1 s7 , s0 , s15
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO, { s6 - s7 }
|
||||
vstmia.f32 YO, { s6 - s7 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
@@ -313,8 +313,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
vldmia.f32 XO , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -327,14 +327,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
|
||||
@@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
|
||||
fmuls s8 , s0, s4
|
||||
@@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_M2
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_E
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
@@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias CO2, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -513,7 +513,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
@@ -523,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias CO2, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -693,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -818,7 +818,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -170,30 +170,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fmuls s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmuls s17 , s0, s9
|
||||
fmuls s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmuls s18 , s2, s8
|
||||
fmuls s26 , s3, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmuls s19 , s2, s9
|
||||
fmuls s27 , s3, s8
|
||||
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmuls s20 , s0, s10
|
||||
fmuls s28 , s1, s11
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmuls s21 , s0, s11
|
||||
fmuls s29 , s1, s10
|
||||
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmuls s22 , s2, s10
|
||||
fmuls s30 , s3, s11
|
||||
fmuls s23 , s2, s11
|
||||
@@ -206,17 +206,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmacs s24 , s1, s9
|
||||
fmacs s17 , s0, s9
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fmacs s18 , s2, s8
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmacs s27 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s10
|
||||
@@ -238,19 +238,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s24 , s5, s13
|
||||
fmacs s17 , s4, s13
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
fmacs s25 , s5, s12
|
||||
|
||||
fmacs s18 , s6, s12
|
||||
fmacs s26 , s7, s13
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
fmacs s19 , s6, s13
|
||||
fmacs s27 , s7, s12
|
||||
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s20 , s4, s14
|
||||
fmacs s28 , s5, s15
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s21 , s4, s15
|
||||
fmacs s29 , s5, s14
|
||||
|
||||
@@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fmacs s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s17 , s0, s9
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s18 , s2, s8
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
@@ -354,8 +354,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s10, s1 , s23
|
||||
FMAC_I2 s11, s1 , s22
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
fstmias CO2, { s8 - s11 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s8 - s11 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -532,8 +532,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s8 , s1 , s21
|
||||
FMAC_I2 s9 , s1 , s20
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
fstmias CO2, { s8 - s9 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s8 - s9 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -710,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s19
|
||||
FMAC_I2 s7 , s1 , s18
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -835,7 +835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s17
|
||||
FMAC_I2 s5 , s1 , s16
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d0 - d3 }
|
||||
fstmiad Y!, { d0 - d3 }
|
||||
vldmia.f64 X!, { d0 - d3 }
|
||||
vstmia.f64 Y!, { d0 - d3 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY_F1
|
||||
|
||||
fldmiad X!, { d0 }
|
||||
fstmiad Y!, { d0 }
|
||||
vldmia.f64 X!, { d0 }
|
||||
vstmia.f64 Y!, { d0 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_S4
|
||||
|
||||
nop
|
||||
fldmiad X, { d0 }
|
||||
fstmiad Y, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d1 }
|
||||
fstmiad Y, { d1 }
|
||||
vldmia.f64 X, { d1 }
|
||||
vstmia.f64 Y, { d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d0 }
|
||||
fstmiad Y, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d1 }
|
||||
fstmiad Y, { d1 }
|
||||
vldmia.f64 X, { d1 }
|
||||
vstmia.f64 Y, { d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY_S1
|
||||
|
||||
fldmiad X, { d0 }
|
||||
fstmiad Y, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -67,26 +67,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d8 }
|
||||
vldmia.f64 X!, { d8 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y!, { d4 }
|
||||
fldmiad Y!, { d5 }
|
||||
vldmia.f64 Y!, { d4 }
|
||||
vldmia.f64 Y!, { d5 }
|
||||
fmacd d0 , d4, d8
|
||||
fldmiad X!, { d9 }
|
||||
fldmiad Y!, { d6 }
|
||||
vldmia.f64 X!, { d9 }
|
||||
vldmia.f64 Y!, { d6 }
|
||||
fmacd d1 , d5, d9
|
||||
fldmiad X!, { d10 }
|
||||
fldmiad X!, { d11 }
|
||||
vldmia.f64 X!, { d10 }
|
||||
vldmia.f64 X!, { d11 }
|
||||
fmacd d0 , d6, d10
|
||||
fldmiad Y!, { d7 }
|
||||
vldmia.f64 Y!, { d7 }
|
||||
fmacd d1 , d7, d11
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
fldmiad Y!, { d8 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vldmia.f64 Y!, { d8 }
|
||||
fmacd d0 , d4, d8
|
||||
|
||||
.endm
|
||||
@@ -97,26 +97,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S4
|
||||
|
||||
nop
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d8 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d0 , d4, d8
|
||||
|
||||
fldmiad X, { d5 }
|
||||
fldmiad Y, { d9 }
|
||||
vldmia.f64 X, { d5 }
|
||||
vldmia.f64 Y, { d9 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d1 , d5, d9
|
||||
|
||||
fldmiad X, { d6 }
|
||||
fldmiad Y, { d10 }
|
||||
vldmia.f64 X, { d6 }
|
||||
vldmia.f64 Y, { d10 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d0 , d6, d10
|
||||
|
||||
fldmiad X, { d7 }
|
||||
fldmiad Y, { d11 }
|
||||
vldmia.f64 X, { d7 }
|
||||
vldmia.f64 Y, { d11 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d1 , d7, d11
|
||||
@@ -126,8 +126,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d8 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d8 }
|
||||
add X, X, INC_X
|
||||
fmacd d0 , d4, d8
|
||||
add Y, Y, INC_Y
|
||||
@@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp N, #0
|
||||
ble ddot_kernel_L999
|
||||
|
||||
cmp INC_X, #0
|
||||
beq ddot_kernel_L999
|
||||
# cmp INC_X, #0
|
||||
# beq ddot_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq ddot_kernel_L999
|
||||
# cmp INC_Y, #0
|
||||
# beq ddot_kernel_L999
|
||||
|
||||
cmp INC_X, #1
|
||||
bne ddot_kernel_S_BEGIN
|
||||
|
||||
@@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add r4 , CO2, r3
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
fldmiad CO1, { d8 - d11 }
|
||||
vldmia.f64 CO1, { d8 - d11 }
|
||||
pld [ r4 , #C_PRE ]
|
||||
|
||||
fmacd d8 , d0 , d16
|
||||
@@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacd d15, d0 , d23
|
||||
fstd d11, [CO1, #24 ]
|
||||
|
||||
fldmiad r4, { d8 - d11 }
|
||||
vldmia.f64 r4, { d8 - d11 }
|
||||
|
||||
fmacd d8 , d0 , d24
|
||||
fstd d12, [CO2]
|
||||
@@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
fldmiad CO2, { d12 - d15 }
|
||||
vldmia.f64 CO2, { d12 - d15 }
|
||||
|
||||
fstd d8 , [r4 ]
|
||||
fmacd d12, d0 , d28
|
||||
@@ -378,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fstd d11, [r4 , #24 ]
|
||||
fmacd d15, d0 , d31
|
||||
|
||||
fstmiad CO2, { d12 - d15 }
|
||||
vstmia.f64 CO2, { d12 - d15 }
|
||||
|
||||
add CO1, CO1, #32
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d3 , [ AO2, #8 ]
|
||||
|
||||
add AO1, AO1, #16
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO2, AO2, #16
|
||||
|
||||
.endm
|
||||
@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d1 , [ AO2, #0 ]
|
||||
add AO1, AO1, #8
|
||||
|
||||
fstmiad BO!, { d0 - d1 }
|
||||
vstmia.f64 BO!, { d0 - d1 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d0 , [ AO1, #0 ]
|
||||
fldd d1 , [ AO1, #8 ]
|
||||
|
||||
fstmiad BO!, { d0 - d1 }
|
||||
vstmia.f64 BO!, { d0 - d1 }
|
||||
add AO1, AO1, #16
|
||||
|
||||
.endm
|
||||
@@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldd d0 , [ AO1, #0 ]
|
||||
|
||||
fstmiad BO!, { d0 }
|
||||
vstmia.f64 BO!, { d0 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
|
||||
@@ -105,10 +105,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d11, [ AO4, #16 ]
|
||||
fldd d15, [ AO4, #24 ]
|
||||
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO4, AO4, #32
|
||||
fstmiad BO!, { d4 - d7 }
|
||||
fstmiad BO!, { d8 - d15 }
|
||||
vstmia.f64 BO!, { d4 - d7 }
|
||||
vstmia.f64 BO!, { d8 - d15 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d3 , [ AO4, #0 ]
|
||||
|
||||
add AO3, AO3, #8
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO4, AO4, #8
|
||||
|
||||
.endm
|
||||
@@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d5 , [ AO2, #16 ]
|
||||
fldd d7 , [ AO2, #24 ]
|
||||
|
||||
fstmiad BO!, { d0 - d7 }
|
||||
vstmia.f64 BO!, { d0 - d7 }
|
||||
add AO2, AO2, #32
|
||||
|
||||
.endm
|
||||
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d1 , [ AO2, #0 ]
|
||||
add AO1, AO1, #8
|
||||
|
||||
fstmiad BO!, { d0 - d1 }
|
||||
vstmia.f64 BO!, { d0 - d1 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -164,7 +164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d2 , [ AO1, #16 ]
|
||||
fldd d3 , [ AO1, #24 ]
|
||||
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO1, AO1, #32
|
||||
|
||||
.endm
|
||||
@@ -174,7 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldd d0 , [ AO1, #0 ]
|
||||
|
||||
fstmiad BO!, { d0 }
|
||||
vstmia.f64 BO!, { d0 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
|
||||
@@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x4
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmiad AO1, { d0 - d3 }
|
||||
vldmia.f64 AO1, { d0 - d3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d4 - d7 }
|
||||
vldmia.f64 r3, { d4 - d7 }
|
||||
|
||||
add r3, r3, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d8 - d11 }
|
||||
vldmia.f64 r3, { d8 - d11 }
|
||||
|
||||
add r3, r3, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d12 - d15 }
|
||||
vldmia.f64 r3, { d12 - d15 }
|
||||
|
||||
fstmiad BO1, { d0 - d15 }
|
||||
vstmia.f64 BO1, { d0 - d15 }
|
||||
add AO1, AO1, #32
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x4
|
||||
|
||||
fldmiad AO1, { d0 - d1 }
|
||||
vldmia.f64 AO1, { d0 - d1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d2 - d3 }
|
||||
vldmia.f64 r3, { d2 - d3 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d4 - d5 }
|
||||
vldmia.f64 r3, { d4 - d5 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d6 - d7 }
|
||||
vldmia.f64 r3, { d6 - d7 }
|
||||
|
||||
fstmiad BO2, { d0 - d7 }
|
||||
vstmia.f64 BO2, { d0 - d7 }
|
||||
add AO1, AO1, #16
|
||||
add BO2, BO2, #64
|
||||
|
||||
@@ -117,18 +117,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x4
|
||||
|
||||
fldmiad AO1, { d0 }
|
||||
vldmia.f64 AO1, { d0 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d1 }
|
||||
vldmia.f64 r3, { d1 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d2 }
|
||||
vldmia.f64 r3, { d2 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d3 }
|
||||
vldmia.f64 r3, { d3 }
|
||||
|
||||
fstmiad BO3, { d0 - d3 }
|
||||
vstmia.f64 BO3, { d0 - d3 }
|
||||
add AO1, AO1, #8
|
||||
add BO3, BO3, #32
|
||||
|
||||
@@ -139,13 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x2
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmiad AO1, { d0 - d3 }
|
||||
vldmia.f64 AO1, { d0 - d3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d4 - d7 }
|
||||
vldmia.f64 r3, { d4 - d7 }
|
||||
|
||||
fstmiad BO1, { d0 - d7 }
|
||||
vstmia.f64 BO1, { d0 - d7 }
|
||||
add AO1, AO1, #32
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -153,12 +153,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x2
|
||||
|
||||
fldmiad AO1, { d0 - d1 }
|
||||
vldmia.f64 AO1, { d0 - d1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d2 - d3 }
|
||||
vldmia.f64 r3, { d2 - d3 }
|
||||
|
||||
fstmiad BO2, { d0 - d3 }
|
||||
vstmia.f64 BO2, { d0 - d3 }
|
||||
add AO1, AO1, #16
|
||||
add BO2, BO2, #32
|
||||
|
||||
@@ -166,12 +166,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x2
|
||||
|
||||
fldmiad AO1, { d0 }
|
||||
vldmia.f64 AO1, { d0 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d1 }
|
||||
vldmia.f64 r3, { d1 }
|
||||
|
||||
fstmiad BO3, { d0 - d1 }
|
||||
vstmia.f64 BO3, { d0 - d1 }
|
||||
add AO1, AO1, #8
|
||||
add BO3, BO3, #16
|
||||
|
||||
@@ -182,9 +182,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x1
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmiad AO1, { d0 - d3 }
|
||||
vldmia.f64 AO1, { d0 - d3 }
|
||||
|
||||
fstmiad BO1, { d0 - d3 }
|
||||
vstmia.f64 BO1, { d0 - d3 }
|
||||
add AO1, AO1, #32
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -192,9 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x1
|
||||
|
||||
fldmiad AO1, { d0 - d1 }
|
||||
vldmia.f64 AO1, { d0 - d1 }
|
||||
|
||||
fstmiad BO2, { d0 - d1 }
|
||||
vstmia.f64 BO2, { d0 - d1 }
|
||||
add AO1, AO1, #16
|
||||
add BO2, BO2, #16
|
||||
|
||||
@@ -202,9 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x1
|
||||
|
||||
fldmiad AO1, { d0 }
|
||||
vldmia.f64 AO1, { d0 }
|
||||
|
||||
fstmiad BO3, { d0 }
|
||||
vstmia.f64 BO3, { d0 }
|
||||
add AO1, AO1, #8
|
||||
add BO3, BO3, #8
|
||||
|
||||
|
||||
@@ -128,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d8 , [ BO ]
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmiad AO!, { d0 - d1}
|
||||
vldmia.f64 AO!, { d0 - d1}
|
||||
|
||||
fmuld d16 , d0, d8
|
||||
fldmiad AO!, { d2 - d3}
|
||||
vldmia.f64 AO!, { d2 - d3}
|
||||
fmuld d17 , d1, d8
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmuld d18 , d2, d8
|
||||
@@ -148,10 +148,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmuld d23 , d3, d9
|
||||
|
||||
fmuld d24 , d0, d10
|
||||
fldmiad AO!, { d4 - d5 }
|
||||
vldmia.f64 AO!, { d4 - d5 }
|
||||
fmuld d25 , d1, d10
|
||||
fmuld d26 , d2, d10
|
||||
fldmiad AO!, { d6 - d7 }
|
||||
vldmia.f64 AO!, { d6 - d7 }
|
||||
fmuld d27 , d3, d10
|
||||
|
||||
fldd d13, [ BO, #8 ]
|
||||
@@ -173,10 +173,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d8 , [ BO ]
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmiad AO!, { d0 - d1}
|
||||
vldmia.f64 AO!, { d0 - d1}
|
||||
|
||||
fmacd d16 , d0, d8
|
||||
fldmiad AO!, { d2 - d3}
|
||||
vldmia.f64 AO!, { d2 - d3}
|
||||
fmacd d17 , d1, d8
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmacd d18 , d2, d8
|
||||
@@ -193,10 +193,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacd d23 , d3, d9
|
||||
|
||||
fmacd d24 , d0, d10
|
||||
fldmiad AO!, { d4 - d5 }
|
||||
vldmia.f64 AO!, { d4 - d5 }
|
||||
fmacd d25 , d1, d10
|
||||
fmacd d26 , d2, d10
|
||||
fldmiad AO!, { d6 - d7 }
|
||||
vldmia.f64 AO!, { d6 - d7 }
|
||||
fmacd d27 , d3, d10
|
||||
|
||||
fldd d13, [ BO, #8 ]
|
||||
@@ -225,11 +225,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d8 , [ BO ]
|
||||
fmacd d21 , d5, d13
|
||||
fmacd d22 , d6, d13
|
||||
fldmiad AO!, { d0 - d1 }
|
||||
vldmia.f64 AO!, { d0 - d1 }
|
||||
fmacd d23 , d7, d13
|
||||
|
||||
fmacd d24 , d4, d14
|
||||
fldmiad AO!, { d2 - d3 }
|
||||
vldmia.f64 AO!, { d2 - d3 }
|
||||
fmacd d25 , d5, d14
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmacd d26 , d6, d14
|
||||
@@ -257,10 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacd d19 , d3, d8
|
||||
|
||||
fmacd d20 , d0, d9
|
||||
fldmiad AO!, { d4 - d5 }
|
||||
vldmia.f64 AO!, { d4 - d5 }
|
||||
fmacd d21 , d1, d9
|
||||
fmacd d22 , d2, d9
|
||||
fldmiad AO!, { d6 - d7 }
|
||||
vldmia.f64 AO!, { d6 - d7 }
|
||||
fmacd d23 , d3, d9
|
||||
|
||||
fmacd d24 , d0, d10
|
||||
@@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fstd d11, [r4 , #24 ]
|
||||
fmuld d15, d0 , d31
|
||||
|
||||
fstmiad CO2, { d12 - d15 }
|
||||
vstmia.f64 CO2, { d12 - d15 }
|
||||
|
||||
add CO1, CO1, #32
|
||||
|
||||
|
||||
@@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1 , { d4 - d7 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1 , { d4 - d7 }
|
||||
|
||||
vmla.f64 d8 , d2 , d4
|
||||
pld [ AO2 , #4*SIZE ]
|
||||
@@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmla.f64 d11 , d2 , d7
|
||||
|
||||
|
||||
fldmiad r3 , { d4 - d7 }
|
||||
vldmia.f64 r3 , { d4 - d7 }
|
||||
|
||||
vmla.f64 d12 , d2 , d4
|
||||
vmla.f64 d13 , d2 , d5
|
||||
@@ -164,23 +164,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmiad YO, { d4 - d7 }
|
||||
vldmia.f64 YO, { d4 - d7 }
|
||||
|
||||
vmla.f64 d4 , d0, d8
|
||||
vmla.f64 d5 , d0, d9
|
||||
vmla.f64 d6 , d0, d10
|
||||
vmla.f64 d7 , d0, d11
|
||||
|
||||
fstmiad YO!, { d4 - d7 }
|
||||
vstmia.f64 YO!, { d4 - d7 }
|
||||
|
||||
fldmiad YO, { d4 - d7 }
|
||||
vldmia.f64 YO, { d4 - d7 }
|
||||
|
||||
vmla.f64 d4 , d0, d12
|
||||
vmla.f64 d5 , d0, d13
|
||||
vmla.f64 d6 , d0, d14
|
||||
vmla.f64 d7 , d0, d15
|
||||
|
||||
fstmiad YO!, { d4 - d7 }
|
||||
vstmia.f64 YO!, { d4 - d7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -195,8 +195,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d12 , d2 , d8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -204,9 +204,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d12
|
||||
fstmiad YO!, { d4 }
|
||||
vstmia.f64 YO!, { d4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S4X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1 , { d8 - d11 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1 , { d8 - d11 }
|
||||
|
||||
vmla.f64 d12 , d2 , d8
|
||||
add AO1, AO1, LDA
|
||||
@@ -249,24 +249,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S4
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4 , d0, d12
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d5 }
|
||||
vldmia.f64 YO, { d5 }
|
||||
vmla.f64 d5 , d0, d13
|
||||
fstmiad YO, { d5 }
|
||||
vstmia.f64 YO, { d5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4 , d0, d14
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d5 }
|
||||
vldmia.f64 YO, { d5 }
|
||||
vmla.f64 d5 , d0, d15
|
||||
fstmiad YO, { d5 }
|
||||
vstmia.f64 YO, { d5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d12 , d2 , d8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO , INC_X
|
||||
@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d12
|
||||
fstmiad YO , { d4 }
|
||||
vstmia.f64 YO , { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
pld [ AO2, #A_PRE ]
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1 , { s4 - s7 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1 , { s4 - s7 }
|
||||
|
||||
vmla.f32 s8 , s2 , s4
|
||||
vmla.f32 s9 , s2 , s5
|
||||
@@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
add r3, AO1, #4*SIZE
|
||||
|
||||
fldmias r3 , { s4 - s7 }
|
||||
vldmia.f32 r3 , { s4 - s7 }
|
||||
|
||||
vmla.f32 s12 , s2 , s4
|
||||
vmla.f32 s13 , s2 , s5
|
||||
@@ -362,24 +362,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
vmla.f32 s4 , s0, s8
|
||||
vmla.f32 s5 , s0, s9
|
||||
vmla.f32 s6 , s0, s10
|
||||
vmla.f32 s7 , s0, s11
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
vmla.f32 s4 , s0, s12
|
||||
vmla.f32 s5 , s0, s13
|
||||
vmla.f32 s6 , s0, s14
|
||||
vmla.f32 s7 , s0, s15
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -394,8 +394,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s12 , s2 , s8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -403,9 +403,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s12
|
||||
fstmias YO!, { s4 }
|
||||
vstmia.f32 YO!, { s4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1 , { s8 - s11 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1 , { s8 - s11 }
|
||||
|
||||
vmla.f32 s12 , s2 , s8
|
||||
vmla.f32 s13 , s2 , s9
|
||||
@@ -449,24 +449,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S4
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4 , s0, s12
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s5 }
|
||||
vldmia.f32 YO, { s5 }
|
||||
vmla.f32 s5 , s0, s13
|
||||
fstmias YO, { s5 }
|
||||
vstmia.f32 YO, { s5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4 , s0, s14
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s5 }
|
||||
vldmia.f32 YO, { s5 }
|
||||
vmla.f32 s5 , s0, s15
|
||||
fstmias YO, { s5 }
|
||||
vstmia.f32 YO, { s5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -482,8 +482,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s12 , s2 , s8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO , INC_X
|
||||
@@ -492,9 +492,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s12
|
||||
fstmias YO , { s4 }
|
||||
vstmia.f32 YO , { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -138,8 +138,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
fldmiad XO! , { d4 }
|
||||
fldmiad AO1 , { d8 - d15 }
|
||||
vldmia.f64 XO! , { d4 }
|
||||
vldmia.f64 AO1 , { d8 - d15 }
|
||||
|
||||
vmla.f64 d24 , d4 , d8
|
||||
pld [ AO2 , #A_PRE ]
|
||||
@@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmiad YO, { d16 - d23 }
|
||||
vldmia.f64 YO, { d16 - d23 }
|
||||
|
||||
vmla.f64 d16, d0, d24
|
||||
vmla.f64 d17, d0, d25
|
||||
@@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmla.f64 d22, d0, d30
|
||||
vmla.f64 d23, d0, d31
|
||||
|
||||
fstmiad YO!, { d16 - d23 }
|
||||
vstmia.f64 YO!, { d16 - d23 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -184,8 +184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d4 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO! , { d4 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d24 , d4 , d8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -193,9 +193,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d16 }
|
||||
vldmia.f64 YO, { d16 }
|
||||
vmla.f64 d16, d0, d24
|
||||
fstmiad YO!, { d16 }
|
||||
vstmia.f64 YO!, { d16 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
pld [ AO2 , #A_PRE+32 ]
|
||||
fldmiad XO , { d4 }
|
||||
fldmiad AO1 , { d8 - d15 }
|
||||
vldmia.f64 XO , { d4 }
|
||||
vldmia.f64 AO1 , { d8 - d15 }
|
||||
|
||||
vmla.f64 d24 , d4 , d8
|
||||
vmla.f64 d25 , d4 , d9
|
||||
@@ -253,44 +253,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S8
|
||||
|
||||
fldmiad YO, { d16 }
|
||||
vldmia.f64 YO, { d16 }
|
||||
vmla.f64 d16, d0, d24
|
||||
fstmiad YO, { d16 }
|
||||
vstmia.f64 YO, { d16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d17 }
|
||||
vldmia.f64 YO, { d17 }
|
||||
vmla.f64 d17, d0, d25
|
||||
fstmiad YO, { d17 }
|
||||
vstmia.f64 YO, { d17 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d18 }
|
||||
vldmia.f64 YO, { d18 }
|
||||
vmla.f64 d18, d0, d26
|
||||
fstmiad YO, { d18 }
|
||||
vstmia.f64 YO, { d18 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d19 }
|
||||
vldmia.f64 YO, { d19 }
|
||||
vmla.f64 d19, d0, d27
|
||||
fstmiad YO, { d19 }
|
||||
vstmia.f64 YO, { d19 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d20 }
|
||||
vldmia.f64 YO, { d20 }
|
||||
vmla.f64 d20, d0, d28
|
||||
fstmiad YO, { d20 }
|
||||
vstmia.f64 YO, { d20 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d21 }
|
||||
vldmia.f64 YO, { d21 }
|
||||
vmla.f64 d21, d0, d29
|
||||
fstmiad YO, { d21 }
|
||||
vstmia.f64 YO, { d21 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d22 }
|
||||
vldmia.f64 YO, { d22 }
|
||||
vmla.f64 d22, d0, d30
|
||||
fstmiad YO, { d22 }
|
||||
vstmia.f64 YO, { d22 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d23 }
|
||||
vldmia.f64 YO, { d23 }
|
||||
vmla.f64 d23, d0, d31
|
||||
fstmiad YO, { d23 }
|
||||
vstmia.f64 YO, { d23 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -306,8 +306,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d4 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO , { d4 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d24 , d4 , d8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO, INC_X
|
||||
@@ -316,9 +316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d16 }
|
||||
vldmia.f64 YO, { d16 }
|
||||
vmla.f64 d16, d0, d24
|
||||
fstmiad YO, { d16 }
|
||||
vstmia.f64 YO, { d16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -361,8 +361,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmias XO! , { s4 }
|
||||
fldmias AO1 , { s8 - s15 }
|
||||
vldmia.f32 XO! , { s4 }
|
||||
vldmia.f32 AO1 , { s8 - s15 }
|
||||
|
||||
vmla.f32 s24 , s4 , s8
|
||||
vmla.f32 s25 , s4 , s9
|
||||
@@ -379,7 +379,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmias YO, { s16 - s23 }
|
||||
vldmia.f32 YO, { s16 - s23 }
|
||||
|
||||
vmla.f32 s16, s0, s24
|
||||
vmla.f32 s17, s0, s25
|
||||
@@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmla.f32 s22, s0, s30
|
||||
vmla.f32 s23, s0, s31
|
||||
|
||||
fstmias YO!, { s16 - s23 }
|
||||
vstmia.f32 YO!, { s16 - s23 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -405,8 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s4 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO! , { s4 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s24 , s4 , s8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s16 }
|
||||
vldmia.f32 YO, { s16 }
|
||||
vmla.f32 s16, s0, s24
|
||||
fstmias YO!, { s16 }
|
||||
vstmia.f32 YO!, { s16 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -454,8 +454,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S8X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmias XO , { s4 }
|
||||
fldmias AO1 , { s8 - s15 }
|
||||
vldmia.f32 XO , { s4 }
|
||||
vldmia.f32 AO1 , { s8 - s15 }
|
||||
|
||||
vmla.f32 s24 , s4 , s8
|
||||
vmla.f32 s25 , s4 , s9
|
||||
@@ -473,44 +473,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S8
|
||||
|
||||
fldmias YO, { s16 }
|
||||
vldmia.f32 YO, { s16 }
|
||||
vmla.f32 s16, s0, s24
|
||||
fstmias YO, { s16 }
|
||||
vstmia.f32 YO, { s16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s17 }
|
||||
vldmia.f32 YO, { s17 }
|
||||
vmla.f32 s17, s0, s25
|
||||
fstmias YO, { s17 }
|
||||
vstmia.f32 YO, { s17 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s18 }
|
||||
vldmia.f32 YO, { s18 }
|
||||
vmla.f32 s18, s0, s26
|
||||
fstmias YO, { s18 }
|
||||
vstmia.f32 YO, { s18 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s19 }
|
||||
vldmia.f32 YO, { s19 }
|
||||
vmla.f32 s19, s0, s27
|
||||
fstmias YO, { s19 }
|
||||
vstmia.f32 YO, { s19 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s20 }
|
||||
vldmia.f32 YO, { s20 }
|
||||
vmla.f32 s20, s0, s28
|
||||
fstmias YO, { s20 }
|
||||
vstmia.f32 YO, { s20 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s21 }
|
||||
vldmia.f32 YO, { s21 }
|
||||
vmla.f32 s21, s0, s29
|
||||
fstmias YO, { s21 }
|
||||
vstmia.f32 YO, { s21 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s22 }
|
||||
vldmia.f32 YO, { s22 }
|
||||
vmla.f32 s22, s0, s30
|
||||
fstmias YO, { s22 }
|
||||
vstmia.f32 YO, { s22 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s23 }
|
||||
vldmia.f32 YO, { s23 }
|
||||
vmla.f32 s23, s0, s31
|
||||
fstmias YO, { s23 }
|
||||
vstmia.f32 YO, { s23 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -526,8 +526,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s4 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO , { s4 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s24 , s4 , s8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO, INC_X
|
||||
@@ -536,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s16 }
|
||||
vldmia.f32 YO, { s16 }
|
||||
vmla.f32 s16, s0, s24
|
||||
fstmias YO, { s16 }
|
||||
vstmia.f32 YO, { s16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -112,13 +112,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d12 - d15 }
|
||||
vldmia.f64 XO! , { d12 - d15 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d4 - d5 }
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
fldmiad AO2!, { d6 - d7 }
|
||||
vldmia.f64 AO2!, { d4 - d5 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO2!, { d6 - d7 }
|
||||
|
||||
vmla.f64 d2 , d12 , d8
|
||||
vmla.f64 d3 , d12 , d4
|
||||
@@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmiad XO! , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
fldmiad AO2!, { d4 }
|
||||
vldmia.f64 XO! , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vldmia.f64 AO2!, { d4 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
vmla.f64 d3 , d1 , d4
|
||||
|
||||
@@ -143,10 +143,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmiad YO, { d4 - d5 }
|
||||
vldmia.f64 YO, { d4 - d5 }
|
||||
vmla.f64 d4, d0, d2
|
||||
vmla.f64 d5, d0, d3
|
||||
fstmiad YO!, { d4 - d5 }
|
||||
vstmia.f64 YO!, { d4 - d5 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -160,10 +160,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d12 - d15 }
|
||||
vldmia.f64 XO! , { d12 - d15 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d2 , d12 , d8
|
||||
vmla.f64 d2 , d13 , d9
|
||||
vmla.f64 d2 , d14, d10
|
||||
@@ -173,17 +173,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO! , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d2
|
||||
fstmiad YO!, { d4 }
|
||||
vstmia.f64 YO!, { d4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -197,23 +197,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
fldmiad XO , { d12 }
|
||||
vldmia.f64 XO , { d12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d4 - d5 }
|
||||
vldmia.f64 AO2!, { d4 - d5 }
|
||||
|
||||
fldmiad XO , { d13 }
|
||||
vldmia.f64 XO , { d13 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
fldmiad AO2!, { d6 - d7 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO2!, { d6 - d7 }
|
||||
|
||||
fldmiad XO , { d14 }
|
||||
vldmia.f64 XO , { d14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmiad XO , { d15 }
|
||||
vldmia.f64 XO , { d15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f64 d2 , d12 , d8
|
||||
@@ -229,9 +229,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmiad XO , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
fldmiad AO2!, { d4 }
|
||||
vldmia.f64 XO , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vldmia.f64 AO2!, { d4 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d3 , d1 , d4
|
||||
@@ -240,14 +240,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d2
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d5 }
|
||||
vldmia.f64 YO, { d5 }
|
||||
vmla.f64 d5, d0, d3
|
||||
fstmiad YO, { d5 }
|
||||
vstmia.f64 YO, { d5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -261,20 +261,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
fldmiad XO , { d12 }
|
||||
vldmia.f64 XO , { d12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
|
||||
fldmiad XO , { d13 }
|
||||
vldmia.f64 XO , { d13 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
|
||||
fldmiad XO , { d14 }
|
||||
vldmia.f64 XO , { d14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmiad XO , { d15 }
|
||||
vldmia.f64 XO , { d15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f64 d2 , d12 , d8
|
||||
@@ -286,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
add XO, XO, INC_X
|
||||
|
||||
@@ -295,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d2
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -315,11 +315,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
fldmias XO! , { s12 - s15 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s4 - s5 }
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
fldmias AO2!, { s6 - s7 }
|
||||
vldmia.f32 XO! , { s12 - s15 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s4 - s5 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO2!, { s6 - s7 }
|
||||
|
||||
vmla.f32 s2 , s12 , s8
|
||||
vmla.f32 s3 , s12 , s4
|
||||
@@ -334,9 +334,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmias XO! , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
fldmias AO2!, { s4 }
|
||||
vldmia.f32 XO! , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vldmia.f32 AO2!, { s4 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
vmla.f32 s3 , s1 , s4
|
||||
|
||||
@@ -344,10 +344,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
vmla.f32 s4, s0, s2
|
||||
vmla.f32 s5, s0, s3
|
||||
fstmias YO!, { s4 - s5 }
|
||||
vstmia.f32 YO!, { s4 - s5 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -359,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
fldmias XO! , { s12 - s15 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 XO! , { s12 - s15 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s2 , s12 , s8
|
||||
vmla.f32 s2 , s13 , s9
|
||||
vmla.f32 s2 , s14, s10
|
||||
@@ -371,17 +371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO! , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s2
|
||||
fstmias YO!, { s4 }
|
||||
vstmia.f32 YO!, { s4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -395,21 +395,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
fldmias XO , { s12 }
|
||||
vldmia.f32 XO , { s12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s4 - s5 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s4 - s5 }
|
||||
|
||||
fldmias XO , { s13 }
|
||||
vldmia.f32 XO , { s13 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
fldmias AO2!, { s6 - s7 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO2!, { s6 - s7 }
|
||||
|
||||
fldmias XO , { s14 }
|
||||
vldmia.f32 XO , { s14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmias XO , { s15 }
|
||||
vldmia.f32 XO , { s15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f32 s2 , s12 , s8
|
||||
@@ -425,9 +425,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmias XO , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
fldmias AO2!, { s4 }
|
||||
vldmia.f32 XO , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vldmia.f32 AO2!, { s4 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s3 , s1 , s4
|
||||
@@ -436,14 +436,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s2
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s5 }
|
||||
vldmia.f32 YO, { s5 }
|
||||
vmla.f32 s5, s0, s3
|
||||
fstmias YO, { s5 }
|
||||
vstmia.f32 YO, { s5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -456,20 +456,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
fldmias XO , { s12 }
|
||||
vldmia.f32 XO , { s12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
|
||||
fldmias XO , { s13 }
|
||||
vldmia.f32 XO , { s13 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
|
||||
fldmias XO , { s14 }
|
||||
vldmia.f32 XO , { s14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmias XO , { s15 }
|
||||
vldmia.f32 XO , { s15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f32 s2 , s12 , s8
|
||||
@@ -481,8 +481,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
add XO, XO, INC_X
|
||||
|
||||
@@ -490,9 +490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s2
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -108,17 +108,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d28 - d31 }
|
||||
vldmia.f64 XO! , { d28 - d31 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d16 - d17 }
|
||||
vldmia.f64 AO2!, { d16 - d17 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
vmla.f64 d5 , d28 , d16
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
vmla.f64 d5 , d29 , d17
|
||||
fldmiad AO2!, { d18 - d19 }
|
||||
vldmia.f64 AO2!, { d18 - d19 }
|
||||
vmla.f64 d4 , d30, d10
|
||||
vmla.f64 d5 , d30, d18
|
||||
vmla.f64 d4 , d31, d11
|
||||
@@ -129,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
fldmiad AO2!, { d16 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vldmia.f64 AO2!, { d16 }
|
||||
vmla.f64 d4 , d2 , d8
|
||||
vmla.f64 d5 , d2 , d16
|
||||
|
||||
@@ -139,10 +139,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmiad YO, { d24 - d25 }
|
||||
vldmia.f64 YO, { d24 - d25 }
|
||||
vmla.f64 d24, d0, d4
|
||||
vmla.f64 d25, d0, d5
|
||||
fstmiad YO!, { d24 - d25 }
|
||||
vstmia.f64 YO!, { d24 - d25 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -156,23 +156,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad XO , { d28 }
|
||||
vldmia.f64 XO , { d28 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d16 - d17 }
|
||||
vldmia.f64 AO2!, { d16 - d17 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
fldmiad XO , { d29 }
|
||||
vldmia.f64 XO , { d29 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d5 , d28 , d16
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
fldmiad XO , { d30 }
|
||||
vldmia.f64 XO , { d30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d5 , d29 , d17
|
||||
fldmiad AO2!, { d18 - d19 }
|
||||
vldmia.f64 AO2!, { d18 - d19 }
|
||||
vmla.f64 d4 , d30, d10
|
||||
fldmiad XO , { d31 }
|
||||
vldmia.f64 XO , { d31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d5 , d30, d18
|
||||
vmla.f64 d4 , d31, d11
|
||||
@@ -183,10 +183,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO2!, { d16 }
|
||||
vldmia.f64 AO2!, { d16 }
|
||||
vmla.f64 d4 , d2 , d8
|
||||
vmla.f64 d5 , d2 , d16
|
||||
|
||||
@@ -194,14 +194,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d4
|
||||
fstmiad YO, { d24 }
|
||||
vstmia.f64 YO, { d24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d5
|
||||
fstmiad YO, { d24 }
|
||||
vstmia.f64 YO, { d24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d28 - d31 }
|
||||
vldmia.f64 XO! , { d28 - d31 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
vmla.f64 d4 , d30, d10
|
||||
vmla.f64 d4 , d31, d11
|
||||
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vmla.f64 d4 , d2 , d8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d4
|
||||
fstmiad YO!, { d24 }
|
||||
vstmia.f64 YO!, { d24 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -252,18 +252,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad XO , { d28 }
|
||||
vldmia.f64 XO , { d28 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
fldmiad XO , { d29 }
|
||||
vldmia.f64 XO , { d29 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
fldmiad XO , { d30 }
|
||||
vldmia.f64 XO , { d30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d4 , d30, d10
|
||||
fldmiad XO , { d31 }
|
||||
vldmia.f64 XO , { d31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d4 , d31, d11
|
||||
|
||||
@@ -272,8 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d4 , d2 , d8
|
||||
|
||||
@@ -281,9 +281,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d4
|
||||
fstmiad YO, { d24 }
|
||||
vstmia.f64 YO, { d24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -300,15 +300,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
fldmias XO! , { s28 - s31 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s16 - s17 }
|
||||
vldmia.f32 XO! , { s28 - s31 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s16 - s17 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
vmla.f32 s5 , s28 , s16
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
vmla.f32 s5 , s29 , s17
|
||||
fldmias AO2!, { s18 - s19 }
|
||||
vldmia.f32 AO2!, { s18 - s19 }
|
||||
vmla.f32 s4 , s30, s10
|
||||
vmla.f32 s5 , s30, s18
|
||||
vmla.f32 s4 , s31, s11
|
||||
@@ -319,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
fldmias AO2!, { s16 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vldmia.f32 AO2!, { s16 }
|
||||
vmla.f32 s4 , s2 , s8
|
||||
vmla.f32 s5 , s2 , s16
|
||||
|
||||
@@ -329,10 +329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmias YO, { s24 - s25 }
|
||||
vldmia.f32 YO, { s24 - s25 }
|
||||
vmla.f32 s24, s0, s4
|
||||
vmla.f32 s25, s0, s5
|
||||
fstmias YO!, { s24 - s25 }
|
||||
vstmia.f32 YO!, { s24 - s25 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -345,22 +345,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
fldmias XO , { s28 }
|
||||
vldmia.f32 XO , { s28 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s16 - s17 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s16 - s17 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
fldmias XO , { s29 }
|
||||
vldmia.f32 XO , { s29 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s5 , s28 , s16
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
fldmias XO , { s30 }
|
||||
vldmia.f32 XO , { s30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s5 , s29 , s17
|
||||
fldmias AO2!, { s18 - s19 }
|
||||
vldmia.f32 AO2!, { s18 - s19 }
|
||||
vmla.f32 s4 , s30, s10
|
||||
fldmias XO , { s31 }
|
||||
vldmia.f32 XO , { s31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s5 , s30, s18
|
||||
vmla.f32 s4 , s31, s11
|
||||
@@ -371,10 +371,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO2!, { s16 }
|
||||
vldmia.f32 AO2!, { s16 }
|
||||
vmla.f32 s4 , s2 , s8
|
||||
vmla.f32 s5 , s2 , s16
|
||||
|
||||
@@ -382,14 +382,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s4
|
||||
fstmias YO, { s24 }
|
||||
vstmia.f32 YO, { s24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s5
|
||||
fstmias YO, { s24 }
|
||||
vstmia.f32 YO, { s24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -402,10 +402,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
fldmias XO! , { s28 - s31 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
vldmia.f32 XO! , { s28 - s31 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
vmla.f32 s4 , s30, s10
|
||||
vmla.f32 s4 , s31, s11
|
||||
@@ -415,17 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vmla.f32 s4 , s2 , s8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s4
|
||||
fstmias YO!, { s24 }
|
||||
vstmia.f32 YO!, { s24 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -437,18 +437,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
fldmias XO , { s28 }
|
||||
vldmia.f32 XO , { s28 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
fldmias XO , { s29 }
|
||||
vldmia.f32 XO , { s29 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
fldmias XO , { s30 }
|
||||
vldmia.f32 XO , { s30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s4 , s30, s10
|
||||
fldmias XO , { s31 }
|
||||
vldmia.f32 XO , { s31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s4 , s31, s11
|
||||
|
||||
@@ -457,8 +457,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s4 , s2 , s8
|
||||
|
||||
@@ -466,9 +466,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s4
|
||||
fstmias YO, { s24 }
|
||||
vstmia.f32 YO, { s24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user