Compare commits
264 Commits
revert-164
...
v0.3.4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c0827a7164 | ||
|
|
86cff4effc | ||
|
|
b028960aba | ||
|
|
dceff5542c | ||
|
|
6c7b691083 | ||
|
|
5f4c550c27 | ||
|
|
731b2722ba | ||
|
|
2601cd58ab | ||
|
|
95a5542e3c | ||
|
|
7a2e1bc804 | ||
|
|
35653e38b3 | ||
|
|
71e25ae42f | ||
|
|
97d7298973 | ||
|
|
de0d0ed52f | ||
|
|
081ceb3e02 | ||
|
|
a29ec458c2 | ||
|
|
816775e309 | ||
|
|
b6363f4539 | ||
|
|
19c4bdd8b3 | ||
|
|
f049a4c84f | ||
|
|
f72fdf525c | ||
|
|
5393759a98 | ||
|
|
5cf18e2875 | ||
|
|
910050985a | ||
|
|
0184713e1a | ||
|
|
45c3c459e1 | ||
|
|
113cb00b95 | ||
|
|
5192651706 | ||
|
|
310ea55f29 | ||
|
|
2e6fae2aad | ||
|
|
368d14f8c8 | ||
|
|
42bc2a9202 | ||
|
|
43bb386b10 | ||
|
|
c171b8ad13 | ||
|
|
2f04cf22ac | ||
|
|
807f6e6922 | ||
|
|
ecbeb802a0 | ||
|
|
2c5725cc39 | ||
|
|
e3666931d8 | ||
|
|
ae02a57261 | ||
|
|
a6a52a73f7 | ||
|
|
0427277cef | ||
|
|
4f43668eec | ||
|
|
b0c15bacc1 | ||
|
|
cfb0f5b0f8 | ||
|
|
667fed579d | ||
|
|
96d2f2c9b2 | ||
|
|
653e657a58 | ||
|
|
5f8f0583d4 | ||
|
|
974a6a30f2 | ||
|
|
9531d0e175 | ||
|
|
40cce0e353 | ||
|
|
3fd41313fc | ||
|
|
a931afe269 | ||
|
|
7d3502b500 | ||
|
|
066f8065d1 | ||
|
|
fb5b2177ca | ||
|
|
f1c02273cb | ||
|
|
661035477c | ||
|
|
aa7e47aa0a | ||
|
|
9c177d270b | ||
|
|
b025523197 | ||
|
|
5b50bd36f7 | ||
|
|
5b708e5eb1 | ||
|
|
dcc5d6291e | ||
|
|
7b5aea52bb | ||
|
|
f5595d0262 | ||
|
|
326d394a0f | ||
|
|
6af8e35a24 | ||
|
|
38cf5d9364 | ||
|
|
8a43baacb2 | ||
|
|
64ca44873b | ||
|
|
2d8064174c | ||
|
|
76a66eaac8 | ||
|
|
2992e3886a | ||
|
|
d5aeff636f | ||
|
|
af2837c392 | ||
|
|
e7b66cd36e | ||
|
|
d50abc8903 | ||
|
|
351a0c777c | ||
|
|
e3c262e5cf | ||
|
|
a293bdcd5e | ||
|
|
c7bbf9c987 | ||
|
|
898a8dcaba | ||
|
|
71c6deed60 | ||
|
|
21f46a1cf2 | ||
|
|
caf339412f | ||
|
|
8001fdcd2a | ||
|
|
162e312832 | ||
|
|
c3d93caa8d | ||
|
|
a71923514f | ||
|
|
55b244ca0d | ||
|
|
2263d3906c | ||
|
|
81c9985c3a | ||
|
|
56ebc7b53e | ||
|
|
c5f88f5a57 | ||
|
|
8a11ec19d1 | ||
|
|
fa53b903db | ||
|
|
84bcdf9c66 | ||
|
|
8f7e986184 | ||
|
|
d0e83666ad | ||
|
|
d4bad73834 | ||
|
|
065763adde | ||
|
|
210b03b543 | ||
|
|
6234a32656 | ||
|
|
c0d7cd3dac | ||
|
|
667f0cc1cb | ||
|
|
d4c8853a02 | ||
|
|
d3d58f8ee5 | ||
|
|
697dc1baf8 | ||
|
|
a9b51b8448 | ||
|
|
eba394c711 | ||
|
|
582c589727 | ||
|
|
adbf6afa25 | ||
|
|
32bec8afbb | ||
|
|
6e2c494556 | ||
|
|
20c5d668fe | ||
|
|
6d43c51ccf | ||
|
|
d74dc39b0f | ||
|
|
41951da6d4 | ||
|
|
474f7e9583 | ||
|
|
79ea839b63 | ||
|
|
f7f97c6148 | ||
|
|
6f22e1cfb8 | ||
|
|
66b43affbc | ||
|
|
1938819c25 | ||
|
|
bda3dbe2eb | ||
|
|
c3e0f0eb38 | ||
|
|
a980953bd7 | ||
|
|
78c99d5231 | ||
|
|
b7496c3638 | ||
|
|
95f4e87579 | ||
|
|
b095f2fad6 | ||
|
|
02ef20a1e4 | ||
|
|
4c3643ed7f | ||
|
|
591cca7cb0 | ||
|
|
3439158dea | ||
|
|
45fe8cb0c5 | ||
|
|
544b069e85 | ||
|
|
9b2a7ad40d | ||
|
|
10ce70701a | ||
|
|
6fc85a6359 | ||
|
|
831c661386 | ||
|
|
7e5df34e6a | ||
|
|
4f45040b89 | ||
|
|
28aa94bf4b | ||
|
|
56e7c68810 | ||
|
|
cf6df9464c | ||
|
|
6f77af2eef | ||
|
|
4d183e5567 | ||
|
|
34d55fd165 | ||
|
|
b991570210 | ||
|
|
288aeea8a2 | ||
|
|
1ad1e79062 | ||
|
|
b402626509 | ||
|
|
ec0cac1669 | ||
|
|
2349e15149 | ||
|
|
f3c262156e | ||
|
|
30f5a69ab8 | ||
|
|
fd081a91e4 | ||
|
|
094f8c3b57 | ||
|
|
5cf090f516 | ||
|
|
58363542e7 | ||
|
|
3abc22a5bf | ||
|
|
1e531701b7 | ||
|
|
5d42b6ea04 | ||
|
|
ba4f433321 | ||
|
|
4cf7315a5d | ||
|
|
b57af93792 | ||
|
|
8aeab0601e | ||
|
|
1cb7b9015e | ||
|
|
a4bd41e9f2 | ||
|
|
9e2bb0c641 | ||
|
|
dbfd7524cd | ||
|
|
2982ce505d | ||
|
|
fd8d1868a1 | ||
|
|
f0563f14ba | ||
|
|
3197f86762 | ||
|
|
422a8fa953 | ||
|
|
5bac15adbd | ||
|
|
e17f969fa0 | ||
|
|
e11126b26a | ||
|
|
74608e470d | ||
|
|
f3fd44a731 | ||
|
|
9e917b16db | ||
|
|
8440a4cb1a | ||
|
|
b55690a659 | ||
|
|
b902a40986 | ||
|
|
5991d1a6cd | ||
|
|
b1b743f434 | ||
|
|
2caa2210bb | ||
|
|
2a589c4b28 | ||
|
|
fd42ca462d | ||
|
|
52d3f7af50 | ||
|
|
5c6e020f49 | ||
|
|
d4d3113adc | ||
|
|
375dff54fc | ||
|
|
a5f165275a | ||
|
|
8c13aa495a | ||
|
|
1ee6d087c3 | ||
|
|
a95a784ab2 | ||
|
|
9bec34cb67 | ||
|
|
87bebdbd8a | ||
|
|
9493f26309 | ||
|
|
36add7570a | ||
|
|
cacacc8007 | ||
|
|
1a00ef3d27 | ||
|
|
4c0d832ec3 | ||
|
|
fc33cbc7bb | ||
|
|
c52a831ae4 | ||
|
|
2e99873ff7 | ||
|
|
00abaa865b | ||
|
|
33043f563f | ||
|
|
66da7677bd | ||
|
|
7932ff3ea9 | ||
|
|
62f4c69708 | ||
|
|
73478664d4 | ||
|
|
ee955757f9 | ||
|
|
48610a4524 | ||
|
|
4a553e8678 | ||
|
|
e788102c10 | ||
|
|
165f00c159 | ||
|
|
40c068a875 | ||
|
|
933896a1d0 | ||
|
|
a4e321400b | ||
|
|
9e65430504 | ||
|
|
2cfa86b406 | ||
|
|
2a9a9389ef | ||
|
|
6463bffd59 | ||
|
|
8ef7d4fb54 | ||
|
|
6400868e55 | ||
|
|
8ebf541e97 | ||
|
|
b03ae3f4dc | ||
|
|
2cc8fb0ad2 | ||
|
|
e8a68ef261 | ||
|
|
64826a0d7d | ||
|
|
25f2d25cfe | ||
|
|
73131fa30a | ||
|
|
66fcdd5be8 | ||
|
|
43ac839c16 | ||
|
|
7ba5936ecd | ||
|
|
b14f44d2ad | ||
|
|
e71d70ba87 | ||
|
|
d671870f5f | ||
|
|
4e103c822c | ||
|
|
d2142760e0 | ||
|
|
2fbfc64da8 | ||
|
|
8d5b33b6be | ||
|
|
36aea5ce2d | ||
|
|
1309711e24 | ||
|
|
571e9de2ac | ||
|
|
448ed15115 | ||
|
|
045fb5ea2c | ||
|
|
4dd70d98d7 | ||
|
|
504310eeb9 | ||
|
|
ea1f39518f | ||
|
|
d0ec4325cf | ||
|
|
3f73e8b8cf | ||
|
|
a49203b48c | ||
|
|
c6aec89d10 | ||
|
|
e6d7711199 | ||
|
|
7a914347c5 | ||
|
|
3a8f0a6a1f | ||
|
|
939452ea9d |
47
.travis.yml
47
.travis.yml
@@ -4,11 +4,10 @@ dist: precise
|
||||
sudo: true
|
||||
language: c
|
||||
|
||||
jobs:
|
||||
matrix:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
@@ -59,7 +58,6 @@ jobs:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
@@ -80,13 +78,12 @@ jobs:
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
stage: test
|
||||
dist: trusty
|
||||
sudo: true
|
||||
language: minimal
|
||||
before_install:
|
||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
|
||||
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
install:
|
||||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
@@ -124,7 +121,6 @@ jobs:
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: clang
|
||||
addons:
|
||||
apt:
|
||||
@@ -153,7 +149,6 @@ jobs:
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
stage: test
|
||||
osx_image: xcode8
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
@@ -168,6 +163,42 @@ jobs:
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- &emulated-arm
|
||||
dist: trusty
|
||||
sudo: required
|
||||
services: docker
|
||||
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
|
||||
name: "Emulated Build for ARMV6 with gcc"
|
||||
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
|
||||
script: |
|
||||
echo "FROM openblas/alpine:${IMAGE_ARCH}
|
||||
COPY . /tmp/openblas
|
||||
RUN mkdir /tmp/openblas/build && \
|
||||
cd /tmp/openblas/build && \
|
||||
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
|
||||
-D TARGET=${TARGET_ARCH} \
|
||||
-D BUILD_SHARED_LIBS=ON \
|
||||
-D BUILD_WITHOUT_LAPACK=ON \
|
||||
-D BUILD_WITHOUT_CBLAS=ON \
|
||||
-D CMAKE_BUILD_TYPE=Release ../ && \
|
||||
cmake --build ." > Dockerfile
|
||||
docker build .
|
||||
- <<: *emulated-arm
|
||||
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
|
||||
name: "Emulated Build for ARMV6 with clang"
|
||||
- <<: *emulated-arm
|
||||
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
|
||||
name: "Emulated Build for ARMV8 with gcc"
|
||||
- <<: *emulated-arm
|
||||
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
|
||||
name: "Emulated Build for ARMV8 with clang"
|
||||
|
||||
allow_failures:
|
||||
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
|
||||
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
|
||||
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
|
||||
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
only:
|
||||
|
||||
118
CMakeLists.txt
118
CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 2.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 4)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
@@ -15,16 +15,21 @@ include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
|
||||
set(OpenBLAS_LIBNAME openblas)
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
endif()
|
||||
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
|
||||
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
|
||||
option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoids conflicts with other BLAS libraries, especially when using
|
||||
# 64 bit integer interfaces in OpenBLAS.
|
||||
|
||||
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
|
||||
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
|
||||
#######
|
||||
if(BUILD_WITHOUT_LAPACK)
|
||||
set(NO_LAPACK 1)
|
||||
@@ -38,11 +43,13 @@ endif()
|
||||
#######
|
||||
|
||||
|
||||
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||
|
||||
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
|
||||
|
||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
@@ -150,6 +157,7 @@ endif()
|
||||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
@@ -169,6 +177,7 @@ endif()
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
@@ -208,15 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
set(ARCH_IN ${ARCH})
|
||||
endif()
|
||||
|
||||
if (${CORE} STREQUAL "generic")
|
||||
set(ARCH_IN "GENERIC")
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED EXPRECISION)
|
||||
set(EXPRECISION_IN 0)
|
||||
else()
|
||||
set(EXPRECISION_IN ${EXPRECISION})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_CBLAS)
|
||||
set(NO_CBLAS_IN 0)
|
||||
else()
|
||||
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
set(NO_LAPACK_IN 0)
|
||||
else()
|
||||
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACKE)
|
||||
set(NO_LAPACKE_IN 0)
|
||||
else()
|
||||
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NEED2UNDERSCORES)
|
||||
set(NEED2UNDERSCORES_IN 0)
|
||||
else()
|
||||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED ONLY_CBLAS)
|
||||
set(ONLY_CBLAS_IN 0)
|
||||
else()
|
||||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED BU)
|
||||
set(BU _)
|
||||
endif()
|
||||
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Install project
|
||||
|
||||
# Install libraries
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||
EXPORT "OpenBLASTargets"
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
|
||||
# Install headers
|
||||
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
|
||||
@@ -264,29 +342,31 @@ if(NOT NO_LAPACKE)
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
endif()
|
||||
|
||||
include(FindPkgConfig QUIET)
|
||||
if(PKG_CONFIG_FOUND)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
endif()
|
||||
|
||||
|
||||
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||
set(PN OpenBLAS)
|
||||
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}")
|
||||
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
|
||||
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
||||
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||
VERSION ${${PN}_VERSION}
|
||||
COMPATIBILITY AnyNewerVersion)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
install(EXPORT "${PN}Targets"
|
||||
NAMESPACE "${PN}::"
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
install(EXPORT "${PN}${SUFFIX64}Targets"
|
||||
NAMESPACE "${PN}${SUFFIX64}::"
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
|
||||
|
||||
211
Changelog.txt
211
Changelog.txt
@@ -1,4 +1,215 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.4
|
||||
02-Dec-2018
|
||||
|
||||
common:
|
||||
* the new, experimental thread-local memory allocation had
|
||||
inadvertently been left enabled for gmake builds in 0.3.3
|
||||
despite the announcement. It is now disabled by default, and
|
||||
single-threaded builds will keep using the old allocator even
|
||||
if the USE_TLS option is turned on.
|
||||
* OpenBLAS will now provide enough buffer space for at least 50
|
||||
threads by default.
|
||||
* The output of openblas_get_config() now contains the version
|
||||
number.
|
||||
* A serious thread safety bug in GEMV operation with small M and
|
||||
large N size has been fixed.
|
||||
* The code will now automatically call blas_thread_init after a
|
||||
fork if needed before handling a call to openblas_set_num_threads
|
||||
* Accesses to parallelized level3 functions from multiple callers
|
||||
are now serialized to avoid thread races (unless using OpenMP).
|
||||
This should provide better performance than the known-threadsafe
|
||||
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
|
||||
* When building LAPACK with gfortran, -frecursive is now (again)
|
||||
enabled by default to ensure correct behaviour.
|
||||
* The OpenBLAS version cblas.h now supports both CBLAS_ORDER and
|
||||
CBLAS_LAYOUT as the name of the matrix row/column order option.
|
||||
* Externally set LDFLAGS are now passed through to the final compile/link
|
||||
steps to facilitate setting platform-specific linker flags.
|
||||
* A potential race condition during the build of LAPACK (that would
|
||||
usually manifest itself as a failure to build TESTING/MATGEN) has been
|
||||
fixed.
|
||||
* xHEMV has been changed to stay single-threaded for small input sizes
|
||||
where the overhead of multithreading exceeds any possible gains
|
||||
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
|
||||
ThunderX hardware with sizable input.
|
||||
* Linker flags for the PGI compiler have been updated
|
||||
* Behaviour of AXPY with zero increments is now handled in the C interface,
|
||||
correcting the result on at least Intel Atom.
|
||||
* The result matrix from calling SGELSS with an all-zero input matrix is
|
||||
now zeroed completely.
|
||||
|
||||
x86_64:
|
||||
* Autodetection of AMD Ryzen2 has been fixed (again).
|
||||
* CMAKE builds now support labeling of an INTERFACE64=1 build of
|
||||
the library with the _64 suffix.
|
||||
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
|
||||
has been sped up by rewriting with C intrinsics
|
||||
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
|
||||
|
||||
POWER:
|
||||
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
|
||||
* CPU type detection has been implemented for AIX.
|
||||
* CPU type detection has been fixed for NETBSD.
|
||||
|
||||
MIPS64:
|
||||
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
|
||||
* DSDOT on LOONGSON3A has been fixed.
|
||||
* the SGEMM microkernel has been hardened against potential data loss.
|
||||
|
||||
ARMV8:
|
||||
* DYNAMIC_ARCH support is now available for 64bit ARM
|
||||
* cross-compiling for ARMV8 under iOS now works.
|
||||
* cpu-specific code has been rearranged to make better use of both
|
||||
hardware commonalities and model-specific compiler optimizations.
|
||||
* XGENE1 has been removed as a TARGET, superseded by the improved generic
|
||||
ARMV8 support.
|
||||
|
||||
ARMV7:
|
||||
* Older assembly mnemonics have been converted to UAL form to allow
|
||||
building with clang 7.0
|
||||
* Cross compiling LAPACKE for Android has been fixed again (broken by
|
||||
update to LAPACK 3.7.0 some while ago).
|
||||
|
||||
====================================================================
|
||||
Version 0.3.3
|
||||
31-Aug-2018
|
||||
|
||||
common:
|
||||
* thread memory allocation has been switched back to the method
|
||||
used before version 0.3.1 due to unexpected problems caused by
|
||||
the new code under some circumstances. A new compile-time option
|
||||
USE_TLS has been added to enable the new code, and it is hoped
|
||||
that this can become the default again in the next version.
|
||||
* LAPACK PR272 has been integrated, which fixes spurious errors
|
||||
in DSYEVR and related functions caused by missing conversion
|
||||
from ILAENV to ILAENV_2STAGE in several _2stage routines.
|
||||
* the cmake-generated OpenBLASConfig.cmake now uses correct case
|
||||
for the name of the library
|
||||
* added support for Haiku OS
|
||||
|
||||
x86_64:
|
||||
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
|
||||
DSCAL, DGEMVN and DSYMVL
|
||||
* added a workaround for a cygwin issue that prevented compilation
|
||||
of AVX512 code
|
||||
|
||||
IBM Z:
|
||||
* added autodetection of Z14
|
||||
* fixed TRMM errors in the generic target
|
||||
|
||||
====================================================================
|
||||
Version 0.3.2
|
||||
30-Jul-2018
|
||||
|
||||
common:
|
||||
* fixes for regressions caused by the rewrite of the thread
|
||||
initialization code in 0.3.1
|
||||
|
||||
POWER:
|
||||
* fixed cpu autodetection for the BSDs
|
||||
|
||||
MIPS64:
|
||||
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
|
||||
|
||||
x86_64:
|
||||
* added autodetection of AMD Ryzen 2
|
||||
* fixed build with older versions of MSVC
|
||||
|
||||
====================================================================
|
||||
Version 0.3.1
|
||||
01-Jul-2018
|
||||
|
||||
common:
|
||||
* rewritten thread initialization code with significantly reduced overhead
|
||||
* added CBLAS interfaces to the IxAMIN BLAS extension functions
|
||||
* fixed the lapack-test target
|
||||
* CMAKE builds now create an OpenBLASConfig.cmake file
|
||||
* ZAXPY now uses a single thread for small input sizes
|
||||
* the LAPACK code was updated from Reference-LAPACK/lapack#253
|
||||
(fixing LAPACKE interfaces to Aasen's functions)
|
||||
|
||||
POWER:
|
||||
* corrected CROT and ZROT behaviour with zero INC_X
|
||||
|
||||
ARMV7:
|
||||
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||
|
||||
x86_64:
|
||||
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
|
||||
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
|
||||
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
|
||||
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
|
||||
specify the list of x86_64 targets to include. Any target not on the list will be supported
|
||||
by the Sandybridge or Nehalem kernels if available, or by Prescott.
|
||||
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
|
||||
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
|
||||
* added autodetection of Intel Cannon Lake series as Skylake X
|
||||
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
|
||||
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
|
||||
recent mingw from MSYS2
|
||||
* fixed a link error in mixed clang/gfortran builds with OpenMP
|
||||
* updated the OSX deployment target to 10.8
|
||||
* switched on parallel make for builds on MS Windows by default
|
||||
|
||||
x86:
|
||||
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
|
||||
|
||||
====================================================================
|
||||
Version 0.3.0
|
||||
23-May-2018
|
||||
|
||||
common:
|
||||
* fixed some more thread race and locking bugs
|
||||
* added preliminary support for calling an OpenMP build of the library from multiple threads
|
||||
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
|
||||
* general code cleanup
|
||||
* optimized DSDOT implementation
|
||||
* improved thread distribution for GEMM
|
||||
* corrected IMATCOPY/OMATCOPY implementation
|
||||
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
|
||||
* cmake build improvements
|
||||
* pkgconfig file now contains build options
|
||||
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
|
||||
* corrections and improvements for systems with more than 64 cpus
|
||||
* LAPACK code updated to 3.8.0 including later fixes
|
||||
* added ReLAPACK, a recursive implementation of several LAPACK functions
|
||||
* Rewrote ROTMG to handle cases that the netlib code failed to address
|
||||
* Disabled (broken) multithreading code for xTRMV
|
||||
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
|
||||
* shared memory access failures on startup are now handled more gracefully
|
||||
* restored utests from earlier releases (and made them pass on all affected systems)
|
||||
|
||||
SPARC:
|
||||
* several fixes for cpu autodetection
|
||||
|
||||
POWER:
|
||||
* corrected vector register overwriting in several Power8 kernels
|
||||
* optimized additional BLAS functions
|
||||
|
||||
ARM:
|
||||
* added support for CortexA53 and A72
|
||||
* added autodetection for ThunderX2T99
|
||||
* made most optimized kernels the default for generic ARMv8 targets
|
||||
|
||||
x86_64:
|
||||
* parallelized DDOT kernel for Haswell
|
||||
* changed alignment directives in assembly kernels to boost performance on OSX
|
||||
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
|
||||
* added support for building on OpenBSD and Dragonfly
|
||||
* updated compiler options to work with Intel release 2018
|
||||
* support fully optimized build with clang/flang on Microsoft Windows
|
||||
* fixed building on AIX
|
||||
|
||||
IBM Z:
|
||||
* added optimized BLAS 1/2 functions
|
||||
|
||||
MIPS:
|
||||
* fixed cpu autodetection helper code
|
||||
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
|
||||
* added mips64 I6500 cpu
|
||||
|
||||
====================================================================
|
||||
Version 0.2.20
|
||||
24-Jul-2017
|
||||
|
||||
6
Makefile
6
Makefile
@@ -97,7 +97,7 @@ endif
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@@ -251,7 +251,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifdef SMP
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else ifeq ($(OSNAME), Haiku)
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
||||
@@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VULCAN)
|
||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA72)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA73)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FALKOR)
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=falkor
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=falkor
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
@@ -48,6 +48,7 @@ ifndef NO_CBLAS
|
||||
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
endif
|
||||
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@@ -66,12 +67,13 @@ endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
@@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
||||
endif
|
||||
endif
|
||||
|
||||
else
|
||||
#install on AIX has different options syntax
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.2.dev
|
||||
VERSION = 0.3.4
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -109,6 +109,12 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
# If you want to use the new, still somewhat experimental code that uses
|
||||
# thread-local storage instead of a central memory buffer in memory.c
|
||||
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
|
||||
# for this to work.
|
||||
# USE_TLS = 1
|
||||
|
||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||
# compilers support this. It's safe to keep it commented out if you
|
||||
# are not sure(equivalent to "-i8" option).
|
||||
@@ -146,6 +152,9 @@ NO_AFFINITY = 1
|
||||
# FUNCTION_PROFILE = 1
|
||||
|
||||
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
|
||||
# This option should not be used - it is a holdover from unfinished code present
|
||||
# in the original GotoBLAS2 library that may be usable as a starting point but
|
||||
# is not even expected to compile in its present form.
|
||||
# QUAD_PRECISION = 1
|
||||
|
||||
# Threads are still working for a while after finishing BLAS operation
|
||||
@@ -183,8 +192,8 @@ NO_AFFINITY = 1
|
||||
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
|
||||
# COMMON_OPT = -O2
|
||||
|
||||
# gfortran option for LAPACK
|
||||
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
|
||||
# gfortran option for LAPACK to improve thread-safety
|
||||
# It is enabled by default in Makefile.system for gfortran
|
||||
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
|
||||
# FCOMMON_OPT = -frecursive
|
||||
|
||||
|
||||
@@ -9,6 +9,11 @@ ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
endif
|
||||
|
||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
|
||||
# Default C compiler
|
||||
@@ -505,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT)
|
||||
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), arm64)
|
||||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
override DYNAMIC_ARCH=
|
||||
@@ -713,6 +725,8 @@ endif
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
EXTRALIB += -lgfortran
|
||||
@@ -1018,6 +1032,12 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
||||
|
||||
ifndef SYMBOLPREFIX
|
||||
SYMBOLPREFIX =
|
||||
endif
|
||||
@@ -1195,7 +1215,11 @@ endif
|
||||
|
||||
LIBDLLNAME = $(LIBPREFIX).dll
|
||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||
ifneq ($(OSNAME), AIX)
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||
else
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
||||
endif
|
||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||
|
||||
@@ -12,6 +12,14 @@ ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
@@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
|
||||
@@ -83,8 +83,11 @@ ARMV5
|
||||
|
||||
8.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
CORTEXA53
|
||||
CORTEXA57
|
||||
VULCAN
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
FALKOR
|
||||
THUNDERX
|
||||
THUNDERX2T99
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 0.0};
|
||||
char trans='N';
|
||||
blasint m, i, j;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
|
||||
6
c_check
6
c_check
@@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
|
||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
@@ -204,7 +205,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "int main(void){ __asm__ volatile($code); }\n";
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
@@ -223,7 +224,6 @@ $data =~ /globl\s([_\.]*)(.*)/;
|
||||
$need_fu = $1;
|
||||
|
||||
$cross = 0;
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
if ($architecture ne $hostarch) {
|
||||
$cross = 1;
|
||||
@@ -231,6 +231,8 @@ if ($architecture ne $hostarch) {
|
||||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||
}
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
||||
3
cblas.h
3
cblas.h
@@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
|
||||
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
||||
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
||||
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
||||
|
||||
typedef CBLAS_ORDER CBLAS_LAYOUT;
|
||||
|
||||
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (INTERFACE64)
|
||||
set(SUFFIX64 64)
|
||||
set(SUFFIX64_UNDERSCORE _64)
|
||||
endif()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
@@ -39,7 +44,7 @@ endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
set(EXTRALIB "${EXTRALIB} -lgfortran") # append to the existing value; "{EXTRALIB}" without "$" is literal text, not a variable reference
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
@@ -6,5 +7,5 @@ Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: -L${libdir} -lopenblas
|
||||
Libs: -L${libdir} -lopenblas${libsuffix}
|
||||
Cflags: -I${includedir}
|
||||
|
||||
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
|
||||
endif ()
|
||||
|
||||
# Cannot run getarch on target if we are cross-compiling
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
|
||||
# Write to config as getarch would
|
||||
|
||||
# TODO: Set up defines that getarch sets up based on every other target
|
||||
|
||||
@@ -41,6 +41,12 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
message(STATUS "Targeting the ${TARGET} architecture.")
|
||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||
@@ -214,6 +220,10 @@ if (CONSISTENT_FPCSR)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
||||
endif ()
|
||||
|
||||
if (USE_TLS)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
|
||||
endif ()
|
||||
|
||||
# Only for development
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
||||
@@ -300,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
|
||||
endif ()
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
|
||||
|
||||
set(REVISION "-r${OpenBLAS_VERSION}")
|
||||
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
|
||||
|
||||
|
||||
@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
|
||||
set(HOST_OS WINNT)
|
||||
endif ()
|
||||
|
||||
if (${HOST_OS} STREQUAL "LINUX")
|
||||
# check if we're building natively on Android (TERMUX)
|
||||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||
if(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
set(HOST_OS ANDROID)
|
||||
endif(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
|
||||
@@ -67,7 +77,7 @@ else()
|
||||
endif()
|
||||
|
||||
if (X86_64 OR X86)
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||
if (NO_AVX512 EQUAL 1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
|
||||
12
common.h
12
common.h
@@ -105,6 +105,10 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#define NO_SYSV_IPC
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
#ifdef ATOM
|
||||
#define GOTO_ATOM ATOM
|
||||
@@ -179,7 +183,7 @@ extern "C" {
|
||||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
@@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;
|
||||
|
||||
#ifdef USE64BITINT
|
||||
typedef BLASLONG blasint;
|
||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||
#define blasabs(x) llabs(x)
|
||||
#else
|
||||
#define blasabs(x) labs(x)
|
||||
#endif
|
||||
#else
|
||||
typedef int blasint;
|
||||
#define blasabs(x) abs(x)
|
||||
#endif
|
||||
#else
|
||||
#ifdef USE64BITINT
|
||||
|
||||
@@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
|
||||
#define RPCC_DEFINED
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
#define WHEREAMI
|
||||
//#define WHEREAMI
|
||||
static inline int WhereAmI(void){
|
||||
int ret=0;
|
||||
__asm__ __volatile__(".set push \n"
|
||||
|
||||
127
cpuid_arm64.c
127
cpuid_arm64.c
@@ -29,25 +29,37 @@
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV8 1
|
||||
#define CPU_CORTEXA57 2
|
||||
#define CPU_VULCAN 3
|
||||
#define CPU_THUNDERX 4
|
||||
#define CPU_THUNDERX2T99 5
|
||||
// Arm
|
||||
#define CPU_CORTEXA53 2
|
||||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
// Qualcomm
|
||||
#define CPU_FALKOR 6
|
||||
// Cavium
|
||||
#define CPU_THUNDERX 7
|
||||
#define CPU_THUNDERX2T99 8
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
"ARMV8" ,
|
||||
"CORTEXA53",
|
||||
"CORTEXA57",
|
||||
"VULCAN",
|
||||
"CORTEXA72",
|
||||
"CORTEXA73",
|
||||
"FALKOR",
|
||||
"THUNDERX",
|
||||
"THUNDERX2T99"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"unknown",
|
||||
"armv8" ,
|
||||
"armv8",
|
||||
"cortexa53",
|
||||
"cortexa57",
|
||||
"vulcan",
|
||||
"cortexa72",
|
||||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99"
|
||||
};
|
||||
@@ -114,14 +126,24 @@ int detect(void)
|
||||
|
||||
fclose(infile);
|
||||
if(cpu_part != NULL && cpu_implementer != NULL) {
|
||||
if (strstr(cpu_implementer, "0x41") &&
|
||||
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") ))
|
||||
return CPU_CORTEXA57; //or compatible A53, A72
|
||||
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
|
||||
return CPU_VULCAN;
|
||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||
// Arm
|
||||
if (strstr(cpu_implementer, "0x41")) {
|
||||
if (strstr(cpu_part, "0xd03"))
|
||||
return CPU_CORTEXA53;
|
||||
else if (strstr(cpu_part, "0xd07"))
|
||||
return CPU_CORTEXA57;
|
||||
else if (strstr(cpu_part, "0xd08"))
|
||||
return CPU_CORTEXA72;
|
||||
else if (strstr(cpu_part, "0xd09"))
|
||||
return CPU_CORTEXA73;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
return CPU_FALKOR;
|
||||
// Cavium
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
|
||||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||
return CPU_THUNDERX2T99;
|
||||
}
|
||||
|
||||
@@ -180,64 +202,63 @@ void get_subdirname(void)
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
// All arches should define ARMv8
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
|
||||
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
|
||||
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_CORTEXA53:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_VULCAN:
|
||||
printf("#define VULCAN \n");
|
||||
printf("#define HAVE_VFP \n");
|
||||
printf("#define HAVE_VFPV3 \n");
|
||||
printf("#define HAVE_NEON \n");
|
||||
printf("#define HAVE_VFPV4 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
printf("#define CORTEXA57\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
printf("#define HAVE_VFPV3\n");
|
||||
printf("#define HAVE_NEON\n");
|
||||
printf("#define HAVE_VFPV4\n");
|
||||
case CPU_CORTEXA72:
|
||||
case CPU_CORTEXA73:
|
||||
// Common minimum settings for these Arm cores
|
||||
// Can change a lot, but we need to be conservative
|
||||
// TODO: detect info from /sys if possible
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
@@ -250,10 +271,6 @@ void get_cpuconfig(void)
|
||||
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define VULCAN \n");
|
||||
printf("#define HAVE_VFP \n");
|
||||
printf("#define HAVE_VFPV3 \n");
|
||||
printf("#define HAVE_NEON \n");
|
||||
printf("#define HAVE_VFPV4 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
|
||||
@@ -56,6 +56,7 @@
|
||||
#define CPUTYPE_CELL 6
|
||||
#define CPUTYPE_PPCG4 7
|
||||
#define CPUTYPE_POWER8 8
|
||||
#define CPUTYPE_POWER9 9
|
||||
|
||||
char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
@@ -66,7 +67,8 @@ char *cpuname[] = {
|
||||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
};
|
||||
|
||||
char *lowercpuname[] = {
|
||||
@@ -78,7 +80,8 @@ char *lowercpuname[] = {
|
||||
"power6",
|
||||
"cell",
|
||||
"ppcg4",
|
||||
"power8"
|
||||
"power8",
|
||||
"power9"
|
||||
};
|
||||
|
||||
char *corename[] = {
|
||||
@@ -90,7 +93,8 @@ char *corename[] = {
|
||||
"POWER6",
|
||||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8"
|
||||
"POWER8",
|
||||
"POWER8"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
@@ -120,6 +124,7 @@ int detect(void){
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
|
||||
@@ -127,6 +132,33 @@ int detect(void){
|
||||
#endif
|
||||
|
||||
#ifdef _AIX
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = popen("prtconf|grep 'Processor Type'", "r"); /* popen() requires a mode argument; "r" reads the command's output */
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Pro", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pclose(infile);
|
||||
|
||||
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
|
||||
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
|
||||
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
|
||||
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
return CPUTYPE_POWER5;
|
||||
#endif
|
||||
|
||||
@@ -142,6 +174,52 @@ int detect(void){
|
||||
|
||||
return CPUTYPE_PPC970;
|
||||
#endif
|
||||
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
|
||||
int id;
|
||||
__asm __volatile("mfpvr %0" : "=r"(id));
|
||||
switch ( id >> 16 ) {
|
||||
case 0x4e: // POWER9
|
||||
return CPUTYPE_POWER8;
|
||||
break;
|
||||
case 0x4d:
|
||||
case 0x4b: // POWER8/8E
|
||||
return CPUTYPE_POWER8;
|
||||
break;
|
||||
case 0x4a:
|
||||
case 0x3f: // POWER7/7E
|
||||
return CPUTYPE_POWER6;
|
||||
break;
|
||||
case 0x3e:
|
||||
return CPUTYPE_POWER6;
|
||||
break;
|
||||
case 0x3a:
|
||||
return CPUTYPE_POWER5;
|
||||
break;
|
||||
case 0x35:
|
||||
case 0x38: // POWER4 /4+
|
||||
return CPUTYPE_POWER4;
|
||||
break;
|
||||
case 0x40:
|
||||
case 0x41: // POWER3 /3+
|
||||
return CPUTYPE_POWER3;
|
||||
break;
|
||||
case 0x39:
|
||||
case 0x3c:
|
||||
case 0x44:
|
||||
case 0x45:
|
||||
return CPUTYPE_PPC970;
|
||||
break;
|
||||
case 0x70:
|
||||
return CPUTYPE_CELL;
|
||||
break;
|
||||
case 0x8003:
|
||||
return CPUTYPE_PPCG4;
|
||||
break;
|
||||
default:
|
||||
return CPUTYPE_UNKNOWN;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void get_architecture(void){
|
||||
|
||||
@@ -1452,6 +1452,8 @@ int get_cpuname(void){
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// AMD Ryzen2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
@@ -2007,6 +2009,8 @@ int get_coretype(void){
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// Ryzen 2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
|
||||
@@ -29,15 +29,18 @@
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13"
|
||||
"Z13",
|
||||
"Z14"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13"
|
||||
"z13",
|
||||
"z14"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
@@ -62,6 +65,10 @@ int detect(void)
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
/* detect z14, but fall back to z13 */
|
||||
if (strstr(p, "3906")) return CPU_Z13;
|
||||
if (strstr(p, "3907")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
@@ -107,5 +114,9 @@ void get_cpuconfig(void)
|
||||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
4
ctest.c
4
ctest.c
@@ -101,6 +101,10 @@ OS_INTERIX
|
||||
OS_LINUX
|
||||
#endif
|
||||
|
||||
#if defined(__HAIKU__)
|
||||
OS_HAIKU
|
||||
#endif
|
||||
|
||||
#if defined(__i386) || defined(_X86)
|
||||
ARCH_X86
|
||||
#endif
|
||||
|
||||
@@ -62,9 +62,36 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef TRANSA
|
||||
#ifndef thread_local
|
||||
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
|
||||
# define thread_local _Thread_local
|
||||
# elif defined _WIN32 && ( \
|
||||
defined _MSC_VER || \
|
||||
defined __ICL || \
|
||||
defined __DMC__ || \
|
||||
defined __BORLANDC__ )
|
||||
# define thread_local __declspec(thread)
|
||||
/* note that ICC (linux) and Clang are covered by __GNUC__ */
|
||||
# elif defined __GNUC__ || \
|
||||
defined __SUNPRO_C || \
|
||||
defined __xlC__
|
||||
# define thread_local __thread
|
||||
# else
|
||||
# define UNSAFE
|
||||
#endif
|
||||
#endif
|
||||
#if defined USE_OPENMP
|
||||
#undef UNSAFE
|
||||
#endif
|
||||
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
#define Y_DUMMY_NUM 1024
|
||||
#if defined(USE_OPENMP)
|
||||
static FLOAT y_dummy[Y_DUMMY_NUM];
|
||||
#pragma omp threadprivate(y_dummy)
|
||||
# else
|
||||
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
|
||||
# endif
|
||||
#endif
|
||||
|
||||
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
@@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||
#ifdef TRANSA
|
||||
y += n_from * incy * COMPSIZE;
|
||||
#else
|
||||
# ifndef UNSAFE
|
||||
//for split matrix row (n) direction and vector x of gemv_n
|
||||
x += n_from * incx * COMPSIZE;
|
||||
//store partial result for every thread
|
||||
y += (m_to - m_from) * 1 * COMPSIZE * pos;
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
#ifndef TRANSA
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
int split_x=0;
|
||||
#endif
|
||||
|
||||
@@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
i -= width;
|
||||
}
|
||||
|
||||
#ifndef TRANSA
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
//try to split matrix on row direction and x.
|
||||
//Then, reduction.
|
||||
if (num_cpu < nthreads) {
|
||||
@@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
exec_blas(num_cpu, queue);
|
||||
}
|
||||
|
||||
#ifndef TRANSA
|
||||
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||
if(split_x==1){
|
||||
//reduction
|
||||
for(i=0; i<num_cpu; i++){
|
||||
|
||||
@@ -48,6 +48,10 @@
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
#ifndef GEMM_PREFERED_SIZE
|
||||
#define GEMM_PREFERED_SIZE 1
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
@@ -510,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int round_up(int remainder, int width, int multiple)
|
||||
{
|
||||
if (multiple > remainder || width <= multiple)
|
||||
return width;
|
||||
width = (width + multiple - 1) / multiple;
|
||||
width = width * multiple;
|
||||
return width;
|
||||
}
|
||||
|
||||
|
||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb,
|
||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
#ifndef USE_ALLOC_HEAP
|
||||
@@ -554,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
/* Dynamically allocate workspace */
|
||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||
@@ -601,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
num_parts = 0;
|
||||
while (m > 0){
|
||||
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
||||
|
||||
width = round_up(m, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
m -= width;
|
||||
|
||||
if (m < 0) width = width + m;
|
||||
range_M[num_parts + 1] = range_M[num_parts] + width;
|
||||
|
||||
num_parts ++;
|
||||
}
|
||||
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
||||
@@ -645,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
if (width < SWITCH_RATIO) {
|
||||
width = SWITCH_RATIO;
|
||||
}
|
||||
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
n -= width;
|
||||
if (n < 0) width = width + n;
|
||||
range_N[num_parts + 1] = range_N[num_parts] + width;
|
||||
|
||||
num_parts ++;
|
||||
}
|
||||
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
||||
@@ -671,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,11 @@ endif
|
||||
# COMMONOBJS += info.$(SUFFIX)
|
||||
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(ARCH),arm64)
|
||||
COMMONOBJS += dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
@@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c
|
||||
endif
|
||||
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(ARCH),arm64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
||||
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
@@ -582,7 +582,7 @@ int blas_thread_init(void){
|
||||
if(ret!=0){
|
||||
struct rlimit rlim;
|
||||
const char *msg = strerror(ret);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
|
||||
#ifdef RLIMIT_NPROC
|
||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||
@@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
|
||||
|
||||
long i;
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
|
||||
@@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
void goto_set_num_threads(int num_threads)
|
||||
{
|
||||
long i;
|
||||
long i;
|
||||
|
||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
if (num_threads < 1) num_threads = blas_cpu_number;
|
||||
|
||||
|
||||
@@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1) {
|
||||
if (model == 1 || model == 8) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
|
||||
198
driver/others/dynamic_arm64.c
Normal file
198
driver/others/dynamic_arm64.c
Normal file
@@ -0,0 +1,198 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
* that HWCAP_CPUID is defined
|
||||
*/
|
||||
#ifndef HWCAP_CPUID
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
asm("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
"armv8",
|
||||
"cortexa57",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype) {
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i=0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_ARMV8);
|
||||
case 1: return (&gotoblas_CORTEXA57);
|
||||
case 2: return (&gotoblas_THUNDERX);
|
||||
case 3: return (&gotoblas_THUNDERX2T99);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
* MIDR_EL1
|
||||
*
|
||||
* 31 24 23 20 19 16 15 4 3 0
|
||||
* -----------------------------------------------------------------
|
||||
* | Implementer | Variant | Architecture | Part Number | Revision |
|
||||
* -----------------------------------------------------------------
|
||||
*/
|
||||
implementer = (midr_el1 >> 24) & 0xFF;
|
||||
part = (midr_el1 >> 4) & 0xFFF;
|
||||
|
||||
switch(implementer)
|
||||
{
|
||||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
case 0xd07: // Cortex A57
|
||||
case 0xd08: // Cortex A72
|
||||
case 0xd03: // Cortex A53
|
||||
return &gotoblas_CORTEXA57;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
switch (part)
|
||||
{
|
||||
case 0x516: // Vulcan
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
case 0x43: // Cavium
|
||||
switch (part)
|
||||
{
|
||||
case 0x0a1: // ThunderX
|
||||
return &gotoblas_THUNDERX;
|
||||
case 0x0af: // ThunderX2
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_ARMV8;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -35,9 +35,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
#if _MSC_VER < 1900
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static char* openblas_config_str=""
|
||||
"OpenBLAS "
|
||||
VERSION
|
||||
" "
|
||||
#ifdef USE64BITINT
|
||||
"USE64BITINT "
|
||||
" USE64BITINT "
|
||||
#endif
|
||||
#ifdef NO_CBLAS
|
||||
"NO_CBLAS "
|
||||
|
||||
@@ -730,35 +730,8 @@ void blas_set_parameter(void){
|
||||
|
||||
#if defined(ARCH_ARM64)
|
||||
|
||||
#if defined(VULCAN) || defined(THUNDERX2T99)
|
||||
unsigned long dgemm_prefetch_size_a;
|
||||
unsigned long dgemm_prefetch_size_b;
|
||||
unsigned long dgemm_prefetch_size_c;
|
||||
#endif
|
||||
|
||||
/*
 * Set the GEMM tuning globals for ARM64 builds.  Values are only assigned
 * when the build targets VULCAN or THUNDERX2T99; for every other ARM64
 * target this function is a no-op.
 */
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
  /* single precision real */
  sgemm_p = 128;
  sgemm_q = 352;
  sgemm_r = 4096;

  /* double precision real */
  dgemm_p = 160;
  dgemm_q = 128;
  dgemm_r = 4096;

  /* single precision complex */
  cgemm_p = 128;
  cgemm_q = 224;
  cgemm_r = 4096;

  /* double precision complex */
  zgemm_p = 128;
  zgemm_q = 112;
  zgemm_r = 4096;

  /* dgemm prefetch sizes for a, b and c */
  dgemm_prefetch_size_a = 3584;
  dgemm_prefetch_size_b = 512;
  dgemm_prefetch_size_c = 128;
#endif
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -114,15 +114,15 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
||||
10
f_check
10
f_check
@@ -292,9 +292,6 @@ if ($link ne "") {
|
||||
&& ($flags !~ /^-LIST:/)
|
||||
&& ($flags !~ /^-LANG:/)
|
||||
) {
|
||||
if ($vendor eq "PGI") {
|
||||
$flags =~ s/lib$/libso/;
|
||||
}
|
||||
$linker_L .= $flags . " ";
|
||||
}
|
||||
|
||||
@@ -311,17 +308,11 @@ if ($link ne "") {
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($vendor eq "PGI") {
|
||||
$flags =~ s/lib$/libso/;
|
||||
}
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath-link\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($vendor eq "PGI") {
|
||||
$flags =~ s/lib$/libso/;
|
||||
}
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
@@ -330,7 +321,6 @@ if ($link ne "") {
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
&& ($flags !~ /numa/)
|
||||
&& ($flags !~ /crt[0-9]/)
|
||||
&& ($flags !~ /gcc/)
|
||||
&& ($flags !~ /user32/)
|
||||
|
||||
78
getarch.c
78
getarch.c
@@ -927,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ARCHCONFIG "-DARMV8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "armv8"
|
||||
#define CORENAME "ARMV8"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA53
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA53"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA53 " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa53"
|
||||
#define CORENAME "CORTEXA53"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA57
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
@@ -942,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa57"
|
||||
#define CORENAME "CORTEXA57"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_VULCAN
|
||||
#ifdef FORCE_CORTEXA72
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "VULCAN"
|
||||
#define SUBARCHITECTURE "CORTEXA72"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DVULCAN " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
#define ARCHCONFIG "-DCORTEXA72 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "vulcan"
|
||||
#define CORENAME "VULCAN"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa72"
|
||||
#define CORENAME "CORTEXA72"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA73
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA73"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA73 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa73"
|
||||
#define CORENAME "CORTEXA73"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_FALKOR
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "FALKOR"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DFALKOR " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "falkor"
|
||||
#define CORENAME "FALKOR"
|
||||
#else
|
||||
#endif
|
||||
|
||||
@@ -973,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ARCHCONFIG "-DTHUNDERX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx"
|
||||
#define CORENAME "THUNDERX"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX2T99
|
||||
#define ARMV8
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "THUNDERX2T99"
|
||||
@@ -990,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx2t99"
|
||||
#define CORENAME "THUNDERX2T99"
|
||||
#else
|
||||
|
||||
@@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
if (incx == 0 && incy == 0) {
|
||||
*y += n * alpha *(*x);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans) lenx = m;
|
||||
if (trans) leny = n;
|
||||
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
|
||||
|
||||
blas_level1_thread(mode, n, k1, k2, dummyalpha,
|
||||
a, lda, NULL, 0, ipiv, incx,
|
||||
laswp[flag], nthreads);
|
||||
(int(*)())laswp[flag], nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
|
||||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
|
||||
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads);
|
||||
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
long double s;
|
||||
long double r, roe, z;
|
||||
|
||||
long double ada = fabs(da);
|
||||
long double adb = fabs(db);
|
||||
long double ada = fabsl(da);
|
||||
long double adb = fabsl(db);
|
||||
long double scale = ada + adb;
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
@@ -184,7 +184,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -168,7 +168,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN)
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
|
||||
@@ -166,7 +166,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta != ONE) SCAL_K(n, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
@@ -82,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
if (incx == 0 && incy == 0) {
|
||||
*y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) );
|
||||
*(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) );
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -237,7 +237,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (trans & 1) lenx = m;
|
||||
if (trans & 1) leny = n;
|
||||
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO) return;
|
||||
|
||||
|
||||
@@ -190,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -43,6 +43,10 @@
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
// this is smallest dimension N of square input a to permit threading
|
||||
// see graph in issue #1820 for explanation
|
||||
#define MULTI_THREAD_MINIMAL 362
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XHEMV "
|
||||
#elif defined(DOUBLE)
|
||||
@@ -181,7 +185,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
@@ -195,7 +199,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(2);
|
||||
if (n<MULTI_THREAD_MINIMAL) {
|
||||
nthreads = 1 ;
|
||||
} else {
|
||||
nthreads = num_cpu_avail(2);
|
||||
};
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -180,7 +180,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
long double db_i = *(DB + 1);
|
||||
long double r;
|
||||
|
||||
long double ada = fabs(da_r) + fabs(da_i);
|
||||
long double ada = fabsl(da_r) + fabsl(da_i);
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
|
||||
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
|
||||
@@ -42,6 +42,14 @@
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
// benefits. Keep the multi-threading code for the record.
|
||||
#undef SMP
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
|
||||
@@ -81,7 +89,7 @@ FLOAT *y = (FLOAT*)vy;
|
||||
#ifdef SMP
|
||||
//disable multi-thread when incx==0 or incy==0
|
||||
//In that case, the threads would be dependent.
|
||||
if (incx == 0 || incy == 0)
|
||||
if (incx == 0 || incy == 0 || n < 1048576 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
@@ -88,7 +88,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
|
||||
$(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F)
|
||||
|
||||
setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
|
||||
ifeq ($(USE_GEMM3M), 1)
|
||||
$(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@
|
||||
else
|
||||
$(CC) -c $(CFLAGS) $< -o $@
|
||||
endif
|
||||
|
||||
setparam$(TSUFFIX).c : setparam-ref.c
|
||||
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)
|
||||
|
||||
@@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
@@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vabs.f64 d6, d6
|
||||
vadd.f64 d1 , d1, d5
|
||||
vabs.f64 d7, d7
|
||||
@@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
@@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
@@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
@@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vabs.f32 s6, s6
|
||||
vadd.f32 s1 , s1, s5
|
||||
vabs.f32 s7, s7
|
||||
@@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
@@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
@@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
@@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vabs.f64 d6, d6
|
||||
vadd.f64 d1 , d1, d5
|
||||
vabs.f64 d7, d7
|
||||
@@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vabs.f64 d6, d6
|
||||
vadd.f64 d1 , d1, d5
|
||||
vabs.f64 d7, d7
|
||||
@@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
@@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
@@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 -d5 }
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vabs.f64 d4, d4
|
||||
vadd.f64 d0 , d0, d4
|
||||
vabs.f64 d5, d5
|
||||
@@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vabs.f32 s6, s6
|
||||
vadd.f32 s1 , s1, s5
|
||||
vabs.f32 s7, s7
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vabs.f32 s6, s6
|
||||
vadd.f32 s1 , s1, s5
|
||||
vabs.f32 s7, s7
|
||||
@@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
@@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
@@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 -s5 }
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vabs.f32 s4, s4
|
||||
vadd.f32 s0 , s0, s4
|
||||
vabs.f32 s5, s5
|
||||
|
||||
@@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d7 }
|
||||
vldmia.f64 X!, { d4 - d7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y , { d8 - d11 }
|
||||
vldmia.f64 Y , { d8 - d11 }
|
||||
fmacd d8 , d0, d4
|
||||
fstmiad Y!, { d8 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
fmacd d9 , d0, d5
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
fmacd d10, d0, d6
|
||||
fstmiad Y!, { d10 }
|
||||
vstmia.f64 Y!, { d10 }
|
||||
fmacd d11, d0, d7
|
||||
fstmiad Y!, { d11 }
|
||||
vstmia.f64 Y!, { d11 }
|
||||
|
||||
|
||||
.endm
|
||||
@@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
fldmiad Y , { d8 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vldmia.f64 Y , { d8 }
|
||||
fmacd d8 , d0, d4
|
||||
fstmiad Y!, { d8 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X , { d4 }
|
||||
fldmiad Y , { d8 }
|
||||
vldmia.f64 X , { d4 }
|
||||
vldmia.f64 Y , { d8 }
|
||||
fmacd d8 , d0, d4
|
||||
fstmiad Y , { d8 }
|
||||
vstmia.f64 Y , { d8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X!, { s4 - s7 }
|
||||
fldmias Y , { s8 - s11 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
vldmia.f32 Y , { s8 - s11 }
|
||||
fmacs s8 , s0, s4
|
||||
fstmias Y!, { s8 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
fmacs s9 , s0, s5
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
fmacs s10, s0, s6
|
||||
fstmias Y!, { s10 }
|
||||
vstmia.f32 Y!, { s10 }
|
||||
fmacs s11, s0, s7
|
||||
fstmias Y!, { s11 }
|
||||
vstmia.f32 Y!, { s11 }
|
||||
|
||||
|
||||
.endm
|
||||
@@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
fldmias Y , { s8 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vldmia.f32 Y , { s8 }
|
||||
fmacs s8 , s0, s4
|
||||
fstmias Y!, { s8 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X , { s4 }
|
||||
fldmias Y , { s8 }
|
||||
vldmia.f32 X , { s4 }
|
||||
vldmia.f32 Y , { s8 }
|
||||
fmacs s8 , s0, s4
|
||||
fstmias Y , { s8 }
|
||||
vstmia.f32 Y , { s8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d7 }
|
||||
vldmia.f64 X!, { d4 - d7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y , { d8 - d11 }
|
||||
vldmia.f64 Y , { d8 - d11 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y!, { d8 }
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
|
||||
FMAC_R1 d10, d0, d6
|
||||
FMAC_R2 d10, d1, d7
|
||||
FMAC_I1 d11, d0, d7
|
||||
FMAC_I2 d11, d1, d6
|
||||
fstmiad Y!, { d10 }
|
||||
fstmiad Y!, { d11 }
|
||||
vstmia.f64 Y!, { d10 }
|
||||
vstmia.f64 Y!, { d11 }
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d4 - d7 }
|
||||
vldmia.f64 X!, { d4 - d7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y , { d8 - d11 }
|
||||
vldmia.f64 Y , { d8 - d11 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y!, { d8 }
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
|
||||
FMAC_R1 d10, d0, d6
|
||||
FMAC_R2 d10, d1, d7
|
||||
FMAC_I1 d11, d0, d7
|
||||
FMAC_I2 d11, d1, d6
|
||||
fstmiad Y!, { d10 }
|
||||
fstmiad Y!, { d11 }
|
||||
vstmia.f64 Y!, { d10 }
|
||||
vstmia.f64 Y!, { d11 }
|
||||
|
||||
|
||||
|
||||
@@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
fldmiad Y , { d8 - d9 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vldmia.f64 Y , { d8 - d9 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y!, { d8 }
|
||||
fstmiad Y!, { d9 }
|
||||
vstmia.f64 Y!, { d8 }
|
||||
vstmia.f64 Y!, { d9 }
|
||||
|
||||
|
||||
|
||||
@@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X , { d4 - d5 }
|
||||
fldmiad Y , { d8 - d9 }
|
||||
vldmia.f64 X , { d4 - d5 }
|
||||
vldmia.f64 Y , { d8 - d9 }
|
||||
|
||||
FMAC_R1 d8 , d0, d4
|
||||
FMAC_R2 d8 , d1, d5
|
||||
FMAC_I1 d9 , d0, d5
|
||||
FMAC_I2 d9 , d1, d4
|
||||
fstmiad Y , { d8 - d9 }
|
||||
vstmia.f64 Y , { d8 - d9 }
|
||||
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
@@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s4 - s7 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmias Y , { s8 - s11 }
|
||||
vldmia.f32 Y , { s8 - s11 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y!, { s8 }
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
|
||||
FMAC_R1 s10, s0, s6
|
||||
FMAC_R2 s10, s1, s7
|
||||
FMAC_I1 s11, s0, s7
|
||||
FMAC_I2 s11, s1, s6
|
||||
fstmias Y!, { s10 }
|
||||
fstmias Y!, { s11 }
|
||||
vstmia.f32 Y!, { s10 }
|
||||
vstmia.f32 Y!, { s11 }
|
||||
|
||||
fldmias X!, { s4 - s7 }
|
||||
fldmias Y , { s8 - s11 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
vldmia.f32 Y , { s8 - s11 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y!, { s8 }
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
|
||||
FMAC_R1 s10, s0, s6
|
||||
FMAC_R2 s10, s1, s7
|
||||
FMAC_I1 s11, s0, s7
|
||||
FMAC_I2 s11, s1, s6
|
||||
fstmias Y!, { s10 }
|
||||
fstmias Y!, { s11 }
|
||||
vstmia.f32 Y!, { s10 }
|
||||
vstmia.f32 Y!, { s11 }
|
||||
|
||||
|
||||
|
||||
@@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y , { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y , { s8 - s9 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y!, { s8 }
|
||||
fstmias Y!, { s9 }
|
||||
vstmia.f32 Y!, { s8 }
|
||||
vstmia.f32 Y!, { s9 }
|
||||
|
||||
|
||||
|
||||
@@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X , { s4 - s5 }
|
||||
fldmias Y , { s8 - s9 }
|
||||
vldmia.f32 X , { s4 - s5 }
|
||||
vldmia.f32 Y , { s8 - s9 }
|
||||
|
||||
FMAC_R1 s8 , s0, s4
|
||||
FMAC_R2 s8 , s1, s5
|
||||
FMAC_I1 s9 , s0, s5
|
||||
FMAC_I2 s9 , s1, s4
|
||||
fstmias Y , { s8 - s9 }
|
||||
vstmia.f32 Y , { s8 - s9 }
|
||||
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s0 - s7 }
|
||||
fstmias Y!, { s0 - s7 }
|
||||
vldmia.f32 X!, { s0 - s7 }
|
||||
vstmia.f32 Y!, { s0 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY_F1
|
||||
|
||||
fldmias X!, { s0 - s1 }
|
||||
fstmias Y!, { s0 - s1 }
|
||||
vldmia.f32 X!, { s0 - s1 }
|
||||
vstmia.f32 Y!, { s0 - s1 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_S4
|
||||
|
||||
nop
|
||||
fldmias X, { s0 - s1 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s2 - s3 }
|
||||
fstmias Y, { s2 - s3 }
|
||||
vldmia.f32 X, { s2 - s3 }
|
||||
vstmia.f32 Y, { s2 - s3 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s0 - s1 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s2 - s3 }
|
||||
fstmias Y, { s2 - s3 }
|
||||
vldmia.f32 X, { s2 - s3 }
|
||||
vstmia.f32 Y, { s2 - s3 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY_S1
|
||||
|
||||
fldmias X, { s0 - s1 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y!, { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y!, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
fmacs s2 , s5, s9
|
||||
fmacs s3 , s5, s8
|
||||
|
||||
fldmias Y!, { s10 - s11 }
|
||||
vldmia.f32 Y!, { s10 - s11 }
|
||||
fmacs s0 , s6, s10
|
||||
fmacs s1 , s6, s11
|
||||
fmacs s2 , s7, s11
|
||||
fmacs s3 , s7, s10
|
||||
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y!, { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y!, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fldmias X!, { s6 - s7 }
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
fmacs s2 , s5, s9
|
||||
fmacs s3 , s5, s8
|
||||
|
||||
fldmias Y!, { s10 - s11 }
|
||||
vldmia.f32 Y!, { s10 - s11 }
|
||||
fmacs s0 , s6, s10
|
||||
fmacs s1 , s6, s11
|
||||
fmacs s2 , s7, s11
|
||||
@@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
fldmias Y!, { s8 - s9 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vldmia.f32 Y!, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
nop
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
@@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s8 - s9 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s8 - s9 }
|
||||
fmacs s0 , s4, s8
|
||||
fmacs s1 , s4, s9
|
||||
fmacs s2 , s5, s9
|
||||
|
||||
@@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
|
||||
fmuls s8 , s0, s4
|
||||
@@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_M2
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_E
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
fldmias CO2, { s4 - s7 }
|
||||
vldmia.f32 CO2, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias CO2, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
fldmias CO2, { s4 - s5 }
|
||||
vldmia.f32 CO2, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias CO2, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fmuls s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmuls s17 , s0, s9
|
||||
fmuls s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmuls s18 , s2, s8
|
||||
fmuls s26 , s3, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmuls s19 , s2, s9
|
||||
fmuls s27 , s3, s8
|
||||
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmuls s20 , s0, s10
|
||||
fmuls s28 , s1, s11
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmuls s21 , s0, s11
|
||||
fmuls s29 , s1, s10
|
||||
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmuls s22 , s2, s10
|
||||
fmuls s30 , s3, s11
|
||||
fmuls s23 , s2, s11
|
||||
@@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmacs s24 , s1, s9
|
||||
fmacs s17 , s0, s9
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fmacs s18 , s2, s8
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmacs s27 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s10
|
||||
@@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s24 , s5, s13
|
||||
fmacs s17 , s4, s13
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
fmacs s25 , s5, s12
|
||||
|
||||
fmacs s18 , s6, s12
|
||||
fmacs s26 , s7, s13
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
fmacs s19 , s6, s13
|
||||
fmacs s27 , s7, s12
|
||||
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s20 , s4, s14
|
||||
fmacs s28 , s5, s15
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s21 , s4, s15
|
||||
fmacs s29 , s5, s14
|
||||
|
||||
@@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fmacs s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s17 , s0, s9
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s18 , s2, s8
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
@@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
fldmias CO2, { s8 - s11 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
vldmia.f32 CO2, { s8 - s11 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s10, s1 , s23
|
||||
FMAC_I2 s11, s1 , s22
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
fstmias CO2, { s8 - s11 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s8 - s11 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
fldmias CO2, { s8 - s9 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
vldmia.f32 CO2, { s8 - s9 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s8 , s1 , s21
|
||||
FMAC_I2 s9 , s1 , s20
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
fstmias CO2, { s8 - s9 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s8 - s9 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s7 }
|
||||
vldmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s19
|
||||
FMAC_I2 s7 , s1 , s18
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias CO1, { s4 - s5 }
|
||||
vldmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
@@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s17
|
||||
FMAC_I2 s5 , s1 , s16
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s6 , [ AO2, #8 ]
|
||||
flds s7 , [ AO2, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s7 }
|
||||
vstmia.f32 BO!, { s0 - s7 }
|
||||
add AO2, AO2, #16
|
||||
|
||||
.endm
|
||||
@@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s3 , [ AO2, #4 ]
|
||||
|
||||
add AO1, AO1, #8
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s2 , [ AO1, #8 ]
|
||||
flds s3 , [ AO1, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO1, AO1, #16
|
||||
|
||||
.endm
|
||||
@@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0 , [ AO1, #0 ]
|
||||
flds s1 , [ AO1, #4 ]
|
||||
|
||||
fstmias BO!, { s0 - s1 }
|
||||
vstmia.f32 BO!, { s0 - s1 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
|
||||
@@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**************************************************************************************/
|
||||
.macro COPY2x2
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s4 - s7 }
|
||||
vldmia.f32 r3, { s4 - s7 }
|
||||
|
||||
fstmias BO1, { s0 - s7 }
|
||||
vstmia.f32 BO1, { s0 - s7 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x2
|
||||
|
||||
fldmias AO1, { s0 -s1 }
|
||||
vldmia.f32 AO1, { s0 -s1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s2 - s3 }
|
||||
vldmia.f32 r3, { s2 - s3 }
|
||||
|
||||
fstmias BO2, { s0 - s3 }
|
||||
vstmia.f32 BO2, { s0 - s3 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #16
|
||||
|
||||
@@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*************************************************************************************************************************/
|
||||
.macro COPY2x1
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
fstmias BO1, { s0 - s3 }
|
||||
vstmia.f32 BO1, { s0 - s3 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x1
|
||||
|
||||
fldmias AO1, { s0 - s1 }
|
||||
vldmia.f32 AO1, { s0 - s1 }
|
||||
|
||||
fstmias BO2, { s0 - s1 }
|
||||
vstmia.f32 BO2, { s0 - s1 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #8
|
||||
|
||||
|
||||
@@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
@@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, #8
|
||||
|
||||
@@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s6 - s7 }
|
||||
vldmia.f32 YO, { s6 - s7 }
|
||||
|
||||
FMAC_R1 s6 , s0 , s10
|
||||
FMAC_I1 s7 , s0 , s11
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias YO, { s6 - s7 }
|
||||
vstmia.f32 YO, { s6 - s7 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s6 - s7 }
|
||||
vldmia.f32 YO, { s6 - s7 }
|
||||
|
||||
FMAC_R1 s6 , s0 , s14
|
||||
FMAC_I1 s7 , s0 , s15
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO, { s6 - s7 }
|
||||
vstmia.f32 YO, { s6 - s7 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
@@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA_R
|
||||
flds s1, ALPHA_I
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s8
|
||||
FMAC_I1 s5 , s0 , s9
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
|
||||
@@ -150,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmias XO! , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
fldmias AO2!, { s8 - s9 }
|
||||
vldmia.f32 XO! , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
vldmia.f32 AO2!, { s8 - s9 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
@@ -180,7 +180,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -204,8 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
vldmia.f32 XO! , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -216,14 +216,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO!, { s4 - s5 }
|
||||
vstmia.f32 YO!, { s4 - s5 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -249,9 +249,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmias XO , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
fldmias AO2!, { s8 - s9 }
|
||||
vldmia.f32 XO , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
vldmia.f32 AO2!, { s8 - s9 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -269,25 +269,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s6 - s7 }
|
||||
vldmia.f32 YO, { s6 - s7 }
|
||||
|
||||
FMAC_R1 s6 , s0 , s14
|
||||
FMAC_I1 s7 , s0 , s15
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias YO, { s6 - s7 }
|
||||
vstmia.f32 YO, { s6 - s7 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
@@ -313,8 +313,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s2 - s3 }
|
||||
fldmias AO1!, { s4 - s5 }
|
||||
vldmia.f32 XO , { s2 - s3 }
|
||||
vldmia.f32 AO1!, { s4 - s5 }
|
||||
|
||||
fmacs s12 , s4 , s2
|
||||
fmacs s13 , s4 , s3
|
||||
@@ -327,14 +327,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
|
||||
FMAC_R1 s4 , s0 , s12
|
||||
FMAC_I1 s5 , s0 , s13
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias YO, { s4 - s5 }
|
||||
vstmia.f32 YO, { s4 - s5 }
|
||||
|
||||
add YO, YO, INC_Y
|
||||
|
||||
|
||||
@@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
|
||||
fmuls s8 , s0, s4
|
||||
@@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
pld [ AO, #A_PRE ]
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
pld [ BO, #B_PRE ]
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_M2
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_E
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s7 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s0, s5
|
||||
@@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
@@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s15
|
||||
FMAC_I2 s7 , s1 , s14
|
||||
|
||||
fstmias CO2, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -513,7 +513,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
flds s4, FP_ZERO
|
||||
vmov.f32 s5, s4
|
||||
@@ -523,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s13
|
||||
FMAC_I2 s5 , s1 , s12
|
||||
|
||||
fstmias CO2, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -693,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s11
|
||||
FMAC_I2 s7 , s1 , s10
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -818,7 +818,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s9
|
||||
FMAC_I2 s5 , s1 , s8
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -170,30 +170,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_I
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fmuls s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmuls s17 , s0, s9
|
||||
fmuls s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmuls s18 , s2, s8
|
||||
fmuls s26 , s3, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmuls s19 , s2, s9
|
||||
fmuls s27 , s3, s8
|
||||
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmuls s20 , s0, s10
|
||||
fmuls s28 , s1, s11
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmuls s21 , s0, s11
|
||||
fmuls s29 , s1, s10
|
||||
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmuls s22 , s2, s10
|
||||
fmuls s30 , s3, s11
|
||||
fmuls s23 , s2, s11
|
||||
@@ -206,17 +206,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL2x2_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmacs s24 , s1, s9
|
||||
fmacs s17 , s0, s9
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fmacs s18 , s2, s8
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmacs s27 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s10
|
||||
@@ -238,19 +238,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s24 , s5, s13
|
||||
fmacs s17 , s4, s13
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
fmacs s25 , s5, s12
|
||||
|
||||
fmacs s18 , s6, s12
|
||||
fmacs s26 , s7, s13
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
fmacs s19 , s6, s13
|
||||
fmacs s27 , s7, s12
|
||||
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s20 , s4, s14
|
||||
fmacs s28 , s5, s15
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s21 , s4, s15
|
||||
fmacs s29 , s5, s14
|
||||
|
||||
@@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fmacs s24 , s1, s9
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s17 , s0, s9
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s18 , s2, s8
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
@@ -354,8 +354,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s10, s1 , s23
|
||||
FMAC_I2 s11, s1 , s22
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
fstmias CO2, { s8 - s11 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
vstmia.f32 CO2, { s8 - s11 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -532,8 +532,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s8 , s1 , s21
|
||||
FMAC_I2 s9 , s1 , s20
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
fstmias CO2, { s8 - s9 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
vstmia.f32 CO2, { s8 - s9 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
@@ -710,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s6 , s1 , s19
|
||||
FMAC_I2 s7 , s1 , s18
|
||||
|
||||
fstmias CO1, { s4 - s7 }
|
||||
vstmia.f32 CO1, { s4 - s7 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -835,7 +835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
FMAC_R2 s4 , s1 , s17
|
||||
FMAC_I2 s5 , s1 , s16
|
||||
|
||||
fstmias CO1, { s4 - s5 }
|
||||
vstmia.f32 CO1, { s4 - s5 }
|
||||
|
||||
add CO1, CO1, #8
|
||||
|
||||
|
||||
@@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d0 - d3 }
|
||||
fstmiad Y!, { d0 - d3 }
|
||||
vldmia.f64 X!, { d0 - d3 }
|
||||
vstmia.f64 Y!, { d0 - d3 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY_F1
|
||||
|
||||
fldmiad X!, { d0 }
|
||||
fstmiad Y!, { d0 }
|
||||
vldmia.f64 X!, { d0 }
|
||||
vstmia.f64 Y!, { d0 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_S4
|
||||
|
||||
nop
|
||||
fldmiad X, { d0 }
|
||||
fstmiad Y, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d1 }
|
||||
fstmiad Y, { d1 }
|
||||
vldmia.f64 X, { d1 }
|
||||
vstmia.f64 Y, { d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d0 }
|
||||
fstmiad Y, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d1 }
|
||||
fstmiad Y, { d1 }
|
||||
vldmia.f64 X, { d1 }
|
||||
vstmia.f64 Y, { d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY_S1
|
||||
|
||||
fldmiad X, { d0 }
|
||||
fstmiad Y, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -67,26 +67,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X!, { d8 }
|
||||
vldmia.f64 X!, { d8 }
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad Y!, { d4 }
|
||||
fldmiad Y!, { d5 }
|
||||
vldmia.f64 Y!, { d4 }
|
||||
vldmia.f64 Y!, { d5 }
|
||||
fmacd d0 , d4, d8
|
||||
fldmiad X!, { d9 }
|
||||
fldmiad Y!, { d6 }
|
||||
vldmia.f64 X!, { d9 }
|
||||
vldmia.f64 Y!, { d6 }
|
||||
fmacd d1 , d5, d9
|
||||
fldmiad X!, { d10 }
|
||||
fldmiad X!, { d11 }
|
||||
vldmia.f64 X!, { d10 }
|
||||
vldmia.f64 X!, { d11 }
|
||||
fmacd d0 , d6, d10
|
||||
fldmiad Y!, { d7 }
|
||||
vldmia.f64 Y!, { d7 }
|
||||
fmacd d1 , d7, d11
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
fldmiad Y!, { d8 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vldmia.f64 Y!, { d8 }
|
||||
fmacd d0 , d4, d8
|
||||
|
||||
.endm
|
||||
@@ -97,26 +97,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S4
|
||||
|
||||
nop
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d8 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d0 , d4, d8
|
||||
|
||||
fldmiad X, { d5 }
|
||||
fldmiad Y, { d9 }
|
||||
vldmia.f64 X, { d5 }
|
||||
vldmia.f64 Y, { d9 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d1 , d5, d9
|
||||
|
||||
fldmiad X, { d6 }
|
||||
fldmiad Y, { d10 }
|
||||
vldmia.f64 X, { d6 }
|
||||
vldmia.f64 Y, { d10 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d0 , d6, d10
|
||||
|
||||
fldmiad X, { d7 }
|
||||
fldmiad Y, { d11 }
|
||||
vldmia.f64 X, { d7 }
|
||||
vldmia.f64 Y, { d11 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacd d1 , d7, d11
|
||||
@@ -126,8 +126,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d8 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d8 }
|
||||
add X, X, INC_X
|
||||
fmacd d0 , d4, d8
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add r4 , CO2, r3
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
fldmiad CO1, { d8 - d11 }
|
||||
vldmia.f64 CO1, { d8 - d11 }
|
||||
pld [ r4 , #C_PRE ]
|
||||
|
||||
fmacd d8 , d0 , d16
|
||||
@@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacd d15, d0 , d23
|
||||
fstd d11, [CO1, #24 ]
|
||||
|
||||
fldmiad r4, { d8 - d11 }
|
||||
vldmia.f64 r4, { d8 - d11 }
|
||||
|
||||
fmacd d8 , d0 , d24
|
||||
fstd d12, [CO2]
|
||||
@@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
fldmiad CO2, { d12 - d15 }
|
||||
vldmia.f64 CO2, { d12 - d15 }
|
||||
|
||||
fstd d8 , [r4 ]
|
||||
fmacd d12, d0 , d28
|
||||
@@ -378,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fstd d11, [r4 , #24 ]
|
||||
fmacd d15, d0 , d31
|
||||
|
||||
fstmiad CO2, { d12 - d15 }
|
||||
vstmia.f64 CO2, { d12 - d15 }
|
||||
|
||||
add CO1, CO1, #32
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d3 , [ AO2, #8 ]
|
||||
|
||||
add AO1, AO1, #16
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO2, AO2, #16
|
||||
|
||||
.endm
|
||||
@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d1 , [ AO2, #0 ]
|
||||
add AO1, AO1, #8
|
||||
|
||||
fstmiad BO!, { d0 - d1 }
|
||||
vstmia.f64 BO!, { d0 - d1 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d0 , [ AO1, #0 ]
|
||||
fldd d1 , [ AO1, #8 ]
|
||||
|
||||
fstmiad BO!, { d0 - d1 }
|
||||
vstmia.f64 BO!, { d0 - d1 }
|
||||
add AO1, AO1, #16
|
||||
|
||||
.endm
|
||||
@@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldd d0 , [ AO1, #0 ]
|
||||
|
||||
fstmiad BO!, { d0 }
|
||||
vstmia.f64 BO!, { d0 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
|
||||
@@ -105,10 +105,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d11, [ AO4, #16 ]
|
||||
fldd d15, [ AO4, #24 ]
|
||||
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO4, AO4, #32
|
||||
fstmiad BO!, { d4 - d7 }
|
||||
fstmiad BO!, { d8 - d15 }
|
||||
vstmia.f64 BO!, { d4 - d7 }
|
||||
vstmia.f64 BO!, { d8 - d15 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d3 , [ AO4, #0 ]
|
||||
|
||||
add AO3, AO3, #8
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO4, AO4, #8
|
||||
|
||||
.endm
|
||||
@@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d5 , [ AO2, #16 ]
|
||||
fldd d7 , [ AO2, #24 ]
|
||||
|
||||
fstmiad BO!, { d0 - d7 }
|
||||
vstmia.f64 BO!, { d0 - d7 }
|
||||
add AO2, AO2, #32
|
||||
|
||||
.endm
|
||||
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d1 , [ AO2, #0 ]
|
||||
add AO1, AO1, #8
|
||||
|
||||
fstmiad BO!, { d0 - d1 }
|
||||
vstmia.f64 BO!, { d0 - d1 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -164,7 +164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d2 , [ AO1, #16 ]
|
||||
fldd d3 , [ AO1, #24 ]
|
||||
|
||||
fstmiad BO!, { d0 - d3 }
|
||||
vstmia.f64 BO!, { d0 - d3 }
|
||||
add AO1, AO1, #32
|
||||
|
||||
.endm
|
||||
@@ -174,7 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldd d0 , [ AO1, #0 ]
|
||||
|
||||
fstmiad BO!, { d0 }
|
||||
vstmia.f64 BO!, { d0 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
|
||||
@@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x4
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmiad AO1, { d0 - d3 }
|
||||
vldmia.f64 AO1, { d0 - d3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d4 - d7 }
|
||||
vldmia.f64 r3, { d4 - d7 }
|
||||
|
||||
add r3, r3, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d8 - d11 }
|
||||
vldmia.f64 r3, { d8 - d11 }
|
||||
|
||||
add r3, r3, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d12 - d15 }
|
||||
vldmia.f64 r3, { d12 - d15 }
|
||||
|
||||
fstmiad BO1, { d0 - d15 }
|
||||
vstmia.f64 BO1, { d0 - d15 }
|
||||
add AO1, AO1, #32
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x4
|
||||
|
||||
fldmiad AO1, { d0 - d1 }
|
||||
vldmia.f64 AO1, { d0 - d1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d2 - d3 }
|
||||
vldmia.f64 r3, { d2 - d3 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d4 - d5 }
|
||||
vldmia.f64 r3, { d4 - d5 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d6 - d7 }
|
||||
vldmia.f64 r3, { d6 - d7 }
|
||||
|
||||
fstmiad BO2, { d0 - d7 }
|
||||
vstmia.f64 BO2, { d0 - d7 }
|
||||
add AO1, AO1, #16
|
||||
add BO2, BO2, #64
|
||||
|
||||
@@ -117,18 +117,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x4
|
||||
|
||||
fldmiad AO1, { d0 }
|
||||
vldmia.f64 AO1, { d0 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d1 }
|
||||
vldmia.f64 r3, { d1 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d2 }
|
||||
vldmia.f64 r3, { d2 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmiad r3, { d3 }
|
||||
vldmia.f64 r3, { d3 }
|
||||
|
||||
fstmiad BO3, { d0 - d3 }
|
||||
vstmia.f64 BO3, { d0 - d3 }
|
||||
add AO1, AO1, #8
|
||||
add BO3, BO3, #32
|
||||
|
||||
@@ -139,13 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x2
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmiad AO1, { d0 - d3 }
|
||||
vldmia.f64 AO1, { d0 - d3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmiad r3, { d4 - d7 }
|
||||
vldmia.f64 r3, { d4 - d7 }
|
||||
|
||||
fstmiad BO1, { d0 - d7 }
|
||||
vstmia.f64 BO1, { d0 - d7 }
|
||||
add AO1, AO1, #32
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -153,12 +153,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x2
|
||||
|
||||
fldmiad AO1, { d0 - d1 }
|
||||
vldmia.f64 AO1, { d0 - d1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d2 - d3 }
|
||||
vldmia.f64 r3, { d2 - d3 }
|
||||
|
||||
fstmiad BO2, { d0 - d3 }
|
||||
vstmia.f64 BO2, { d0 - d3 }
|
||||
add AO1, AO1, #16
|
||||
add BO2, BO2, #32
|
||||
|
||||
@@ -166,12 +166,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x2
|
||||
|
||||
fldmiad AO1, { d0 }
|
||||
vldmia.f64 AO1, { d0 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmiad r3, { d1 }
|
||||
vldmia.f64 r3, { d1 }
|
||||
|
||||
fstmiad BO3, { d0 - d1 }
|
||||
vstmia.f64 BO3, { d0 - d1 }
|
||||
add AO1, AO1, #8
|
||||
add BO3, BO3, #16
|
||||
|
||||
@@ -182,9 +182,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x1
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmiad AO1, { d0 - d3 }
|
||||
vldmia.f64 AO1, { d0 - d3 }
|
||||
|
||||
fstmiad BO1, { d0 - d3 }
|
||||
vstmia.f64 BO1, { d0 - d3 }
|
||||
add AO1, AO1, #32
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -192,9 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x1
|
||||
|
||||
fldmiad AO1, { d0 - d1 }
|
||||
vldmia.f64 AO1, { d0 - d1 }
|
||||
|
||||
fstmiad BO2, { d0 - d1 }
|
||||
vstmia.f64 BO2, { d0 - d1 }
|
||||
add AO1, AO1, #16
|
||||
add BO2, BO2, #16
|
||||
|
||||
@@ -202,9 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x1
|
||||
|
||||
fldmiad AO1, { d0 }
|
||||
vldmia.f64 AO1, { d0 }
|
||||
|
||||
fstmiad BO3, { d0 }
|
||||
vstmia.f64 BO3, { d0 }
|
||||
add AO1, AO1, #8
|
||||
add BO3, BO3, #8
|
||||
|
||||
|
||||
@@ -128,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d8 , [ BO ]
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmiad AO!, { d0 - d1}
|
||||
vldmia.f64 AO!, { d0 - d1}
|
||||
|
||||
fmuld d16 , d0, d8
|
||||
fldmiad AO!, { d2 - d3}
|
||||
vldmia.f64 AO!, { d2 - d3}
|
||||
fmuld d17 , d1, d8
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmuld d18 , d2, d8
|
||||
@@ -148,10 +148,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmuld d23 , d3, d9
|
||||
|
||||
fmuld d24 , d0, d10
|
||||
fldmiad AO!, { d4 - d5 }
|
||||
vldmia.f64 AO!, { d4 - d5 }
|
||||
fmuld d25 , d1, d10
|
||||
fmuld d26 , d2, d10
|
||||
fldmiad AO!, { d6 - d7 }
|
||||
vldmia.f64 AO!, { d6 - d7 }
|
||||
fmuld d27 , d3, d10
|
||||
|
||||
fldd d13, [ BO, #8 ]
|
||||
@@ -173,10 +173,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d8 , [ BO ]
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmiad AO!, { d0 - d1}
|
||||
vldmia.f64 AO!, { d0 - d1}
|
||||
|
||||
fmacd d16 , d0, d8
|
||||
fldmiad AO!, { d2 - d3}
|
||||
vldmia.f64 AO!, { d2 - d3}
|
||||
fmacd d17 , d1, d8
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmacd d18 , d2, d8
|
||||
@@ -193,10 +193,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacd d23 , d3, d9
|
||||
|
||||
fmacd d24 , d0, d10
|
||||
fldmiad AO!, { d4 - d5 }
|
||||
vldmia.f64 AO!, { d4 - d5 }
|
||||
fmacd d25 , d1, d10
|
||||
fmacd d26 , d2, d10
|
||||
fldmiad AO!, { d6 - d7 }
|
||||
vldmia.f64 AO!, { d6 - d7 }
|
||||
fmacd d27 , d3, d10
|
||||
|
||||
fldd d13, [ BO, #8 ]
|
||||
@@ -225,11 +225,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fldd d8 , [ BO ]
|
||||
fmacd d21 , d5, d13
|
||||
fmacd d22 , d6, d13
|
||||
fldmiad AO!, { d0 - d1 }
|
||||
vldmia.f64 AO!, { d0 - d1 }
|
||||
fmacd d23 , d7, d13
|
||||
|
||||
fmacd d24 , d4, d14
|
||||
fldmiad AO!, { d2 - d3 }
|
||||
vldmia.f64 AO!, { d2 - d3 }
|
||||
fmacd d25 , d5, d14
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmacd d26 , d6, d14
|
||||
@@ -257,10 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacd d19 , d3, d8
|
||||
|
||||
fmacd d20 , d0, d9
|
||||
fldmiad AO!, { d4 - d5 }
|
||||
vldmia.f64 AO!, { d4 - d5 }
|
||||
fmacd d21 , d1, d9
|
||||
fmacd d22 , d2, d9
|
||||
fldmiad AO!, { d6 - d7 }
|
||||
vldmia.f64 AO!, { d6 - d7 }
|
||||
fmacd d23 , d3, d9
|
||||
|
||||
fmacd d24 , d0, d10
|
||||
@@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fstd d11, [r4 , #24 ]
|
||||
fmuld d15, d0 , d31
|
||||
|
||||
fstmiad CO2, { d12 - d15 }
|
||||
vstmia.f64 CO2, { d12 - d15 }
|
||||
|
||||
add CO1, CO1, #32
|
||||
|
||||
|
||||
@@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1 , { d4 - d7 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1 , { d4 - d7 }
|
||||
|
||||
vmla.f64 d8 , d2 , d4
|
||||
pld [ AO2 , #4*SIZE ]
|
||||
@@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmla.f64 d11 , d2 , d7
|
||||
|
||||
|
||||
fldmiad r3 , { d4 - d7 }
|
||||
vldmia.f64 r3 , { d4 - d7 }
|
||||
|
||||
vmla.f64 d12 , d2 , d4
|
||||
vmla.f64 d13 , d2 , d5
|
||||
@@ -164,23 +164,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmiad YO, { d4 - d7 }
|
||||
vldmia.f64 YO, { d4 - d7 }
|
||||
|
||||
vmla.f64 d4 , d0, d8
|
||||
vmla.f64 d5 , d0, d9
|
||||
vmla.f64 d6 , d0, d10
|
||||
vmla.f64 d7 , d0, d11
|
||||
|
||||
fstmiad YO!, { d4 - d7 }
|
||||
vstmia.f64 YO!, { d4 - d7 }
|
||||
|
||||
fldmiad YO, { d4 - d7 }
|
||||
vldmia.f64 YO, { d4 - d7 }
|
||||
|
||||
vmla.f64 d4 , d0, d12
|
||||
vmla.f64 d5 , d0, d13
|
||||
vmla.f64 d6 , d0, d14
|
||||
vmla.f64 d7 , d0, d15
|
||||
|
||||
fstmiad YO!, { d4 - d7 }
|
||||
vstmia.f64 YO!, { d4 - d7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -195,8 +195,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d12 , d2 , d8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -204,9 +204,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d12
|
||||
fstmiad YO!, { d4 }
|
||||
vstmia.f64 YO!, { d4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S4X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1 , { d8 - d11 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1 , { d8 - d11 }
|
||||
|
||||
vmla.f64 d12 , d2 , d8
|
||||
add AO1, AO1, LDA
|
||||
@@ -249,24 +249,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S4
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4 , d0, d12
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d5 }
|
||||
vldmia.f64 YO, { d5 }
|
||||
vmla.f64 d5 , d0, d13
|
||||
fstmiad YO, { d5 }
|
||||
vstmia.f64 YO, { d5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4 , d0, d14
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d5 }
|
||||
vldmia.f64 YO, { d5 }
|
||||
vmla.f64 d5 , d0, d15
|
||||
fstmiad YO, { d5 }
|
||||
vstmia.f64 YO, { d5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d12 , d2 , d8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO , INC_X
|
||||
@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d12
|
||||
fstmiad YO , { d4 }
|
||||
vstmia.f64 YO , { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
pld [ AO2, #A_PRE ]
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1 , { s4 - s7 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1 , { s4 - s7 }
|
||||
|
||||
vmla.f32 s8 , s2 , s4
|
||||
vmla.f32 s9 , s2 , s5
|
||||
@@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
add r3, AO1, #4*SIZE
|
||||
|
||||
fldmias r3 , { s4 - s7 }
|
||||
vldmia.f32 r3 , { s4 - s7 }
|
||||
|
||||
vmla.f32 s12 , s2 , s4
|
||||
vmla.f32 s13 , s2 , s5
|
||||
@@ -362,24 +362,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
vmla.f32 s4 , s0, s8
|
||||
vmla.f32 s5 , s0, s9
|
||||
vmla.f32 s6 , s0, s10
|
||||
vmla.f32 s7 , s0, s11
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
|
||||
fldmias YO, { s4 - s7 }
|
||||
vldmia.f32 YO, { s4 - s7 }
|
||||
|
||||
vmla.f32 s4 , s0, s12
|
||||
vmla.f32 s5 , s0, s13
|
||||
vmla.f32 s6 , s0, s14
|
||||
vmla.f32 s7 , s0, s15
|
||||
|
||||
fstmias YO!, { s4 - s7 }
|
||||
vstmia.f32 YO!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -394,8 +394,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s12 , s2 , s8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -403,9 +403,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s12
|
||||
fstmias YO!, { s4 }
|
||||
vstmia.f32 YO!, { s4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S4X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1 , { s8 - s11 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1 , { s8 - s11 }
|
||||
|
||||
vmla.f32 s12 , s2 , s8
|
||||
vmla.f32 s13 , s2 , s9
|
||||
@@ -449,24 +449,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S4
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4 , s0, s12
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s5 }
|
||||
vldmia.f32 YO, { s5 }
|
||||
vmla.f32 s5 , s0, s13
|
||||
fstmias YO, { s5 }
|
||||
vstmia.f32 YO, { s5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4 , s0, s14
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s5 }
|
||||
vldmia.f32 YO, { s5 }
|
||||
vmla.f32 s5 , s0, s15
|
||||
fstmias YO, { s5 }
|
||||
vstmia.f32 YO, { s5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -482,8 +482,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s12 , s2 , s8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO , INC_X
|
||||
@@ -492,9 +492,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s12
|
||||
fstmias YO , { s4 }
|
||||
vstmia.f32 YO , { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -138,8 +138,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
fldmiad XO! , { d4 }
|
||||
fldmiad AO1 , { d8 - d15 }
|
||||
vldmia.f64 XO! , { d4 }
|
||||
vldmia.f64 AO1 , { d8 - d15 }
|
||||
|
||||
vmla.f64 d24 , d4 , d8
|
||||
pld [ AO2 , #A_PRE ]
|
||||
@@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmiad YO, { d16 - d23 }
|
||||
vldmia.f64 YO, { d16 - d23 }
|
||||
|
||||
vmla.f64 d16, d0, d24
|
||||
vmla.f64 d17, d0, d25
|
||||
@@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmla.f64 d22, d0, d30
|
||||
vmla.f64 d23, d0, d31
|
||||
|
||||
fstmiad YO!, { d16 - d23 }
|
||||
vstmia.f64 YO!, { d16 - d23 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -184,8 +184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d4 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO! , { d4 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d24 , d4 , d8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -193,9 +193,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d16 }
|
||||
vldmia.f64 YO, { d16 }
|
||||
vmla.f64 d16, d0, d24
|
||||
fstmiad YO!, { d16 }
|
||||
vstmia.f64 YO!, { d16 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
pld [ AO2 , #A_PRE+32 ]
|
||||
fldmiad XO , { d4 }
|
||||
fldmiad AO1 , { d8 - d15 }
|
||||
vldmia.f64 XO , { d4 }
|
||||
vldmia.f64 AO1 , { d8 - d15 }
|
||||
|
||||
vmla.f64 d24 , d4 , d8
|
||||
vmla.f64 d25 , d4 , d9
|
||||
@@ -253,44 +253,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S8
|
||||
|
||||
fldmiad YO, { d16 }
|
||||
vldmia.f64 YO, { d16 }
|
||||
vmla.f64 d16, d0, d24
|
||||
fstmiad YO, { d16 }
|
||||
vstmia.f64 YO, { d16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d17 }
|
||||
vldmia.f64 YO, { d17 }
|
||||
vmla.f64 d17, d0, d25
|
||||
fstmiad YO, { d17 }
|
||||
vstmia.f64 YO, { d17 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d18 }
|
||||
vldmia.f64 YO, { d18 }
|
||||
vmla.f64 d18, d0, d26
|
||||
fstmiad YO, { d18 }
|
||||
vstmia.f64 YO, { d18 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d19 }
|
||||
vldmia.f64 YO, { d19 }
|
||||
vmla.f64 d19, d0, d27
|
||||
fstmiad YO, { d19 }
|
||||
vstmia.f64 YO, { d19 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d20 }
|
||||
vldmia.f64 YO, { d20 }
|
||||
vmla.f64 d20, d0, d28
|
||||
fstmiad YO, { d20 }
|
||||
vstmia.f64 YO, { d20 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d21 }
|
||||
vldmia.f64 YO, { d21 }
|
||||
vmla.f64 d21, d0, d29
|
||||
fstmiad YO, { d21 }
|
||||
vstmia.f64 YO, { d21 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d22 }
|
||||
vldmia.f64 YO, { d22 }
|
||||
vmla.f64 d22, d0, d30
|
||||
fstmiad YO, { d22 }
|
||||
vstmia.f64 YO, { d22 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d23 }
|
||||
vldmia.f64 YO, { d23 }
|
||||
vmla.f64 d23, d0, d31
|
||||
fstmiad YO, { d23 }
|
||||
vstmia.f64 YO, { d23 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -306,8 +306,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d4 }
|
||||
fldmiad AO1 , { d8 }
|
||||
vldmia.f64 XO , { d4 }
|
||||
vldmia.f64 AO1 , { d8 }
|
||||
vmla.f64 d24 , d4 , d8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO, INC_X
|
||||
@@ -316,9 +316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d16 }
|
||||
vldmia.f64 YO, { d16 }
|
||||
vmla.f64 d16, d0, d24
|
||||
fstmiad YO, { d16 }
|
||||
vstmia.f64 YO, { d16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -361,8 +361,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F8X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmias XO! , { s4 }
|
||||
fldmias AO1 , { s8 - s15 }
|
||||
vldmia.f32 XO! , { s4 }
|
||||
vldmia.f32 AO1 , { s8 - s15 }
|
||||
|
||||
vmla.f32 s24 , s4 , s8
|
||||
vmla.f32 s25 , s4 , s9
|
||||
@@ -379,7 +379,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F8
|
||||
|
||||
fldmias YO, { s16 - s23 }
|
||||
vldmia.f32 YO, { s16 - s23 }
|
||||
|
||||
vmla.f32 s16, s0, s24
|
||||
vmla.f32 s17, s0, s25
|
||||
@@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmla.f32 s22, s0, s30
|
||||
vmla.f32 s23, s0, s31
|
||||
|
||||
fstmias YO!, { s16 - s23 }
|
||||
vstmia.f32 YO!, { s16 - s23 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -405,8 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s4 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO! , { s4 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s24 , s4 , s8
|
||||
add AO1, AO1, LDA
|
||||
|
||||
@@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s16 }
|
||||
vldmia.f32 YO, { s16 }
|
||||
vmla.f32 s16, s0, s24
|
||||
fstmias YO!, { s16 }
|
||||
vstmia.f32 YO!, { s16 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -454,8 +454,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S8X1
|
||||
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmias XO , { s4 }
|
||||
fldmias AO1 , { s8 - s15 }
|
||||
vldmia.f32 XO , { s4 }
|
||||
vldmia.f32 AO1 , { s8 - s15 }
|
||||
|
||||
vmla.f32 s24 , s4 , s8
|
||||
vmla.f32 s25 , s4 , s9
|
||||
@@ -473,44 +473,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S8
|
||||
|
||||
fldmias YO, { s16 }
|
||||
vldmia.f32 YO, { s16 }
|
||||
vmla.f32 s16, s0, s24
|
||||
fstmias YO, { s16 }
|
||||
vstmia.f32 YO, { s16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s17 }
|
||||
vldmia.f32 YO, { s17 }
|
||||
vmla.f32 s17, s0, s25
|
||||
fstmias YO, { s17 }
|
||||
vstmia.f32 YO, { s17 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s18 }
|
||||
vldmia.f32 YO, { s18 }
|
||||
vmla.f32 s18, s0, s26
|
||||
fstmias YO, { s18 }
|
||||
vstmia.f32 YO, { s18 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s19 }
|
||||
vldmia.f32 YO, { s19 }
|
||||
vmla.f32 s19, s0, s27
|
||||
fstmias YO, { s19 }
|
||||
vstmia.f32 YO, { s19 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s20 }
|
||||
vldmia.f32 YO, { s20 }
|
||||
vmla.f32 s20, s0, s28
|
||||
fstmias YO, { s20 }
|
||||
vstmia.f32 YO, { s20 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s21 }
|
||||
vldmia.f32 YO, { s21 }
|
||||
vmla.f32 s21, s0, s29
|
||||
fstmias YO, { s21 }
|
||||
vstmia.f32 YO, { s21 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s22 }
|
||||
vldmia.f32 YO, { s22 }
|
||||
vmla.f32 s22, s0, s30
|
||||
fstmias YO, { s22 }
|
||||
vstmia.f32 YO, { s22 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s23 }
|
||||
vldmia.f32 YO, { s23 }
|
||||
vmla.f32 s23, s0, s31
|
||||
fstmias YO, { s23 }
|
||||
vstmia.f32 YO, { s23 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -526,8 +526,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s4 }
|
||||
fldmias AO1 , { s8 }
|
||||
vldmia.f32 XO , { s4 }
|
||||
vldmia.f32 AO1 , { s8 }
|
||||
vmla.f32 s24 , s4 , s8
|
||||
add AO1, AO1, LDA
|
||||
add XO, XO, INC_X
|
||||
@@ -536,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s16 }
|
||||
vldmia.f32 YO, { s16 }
|
||||
vmla.f32 s16, s0, s24
|
||||
fstmias YO, { s16 }
|
||||
vstmia.f32 YO, { s16 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -112,13 +112,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d12 - d15 }
|
||||
vldmia.f64 XO! , { d12 - d15 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d4 - d5 }
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
fldmiad AO2!, { d6 - d7 }
|
||||
vldmia.f64 AO2!, { d4 - d5 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO2!, { d6 - d7 }
|
||||
|
||||
vmla.f64 d2 , d12 , d8
|
||||
vmla.f64 d3 , d12 , d4
|
||||
@@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmiad XO! , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
fldmiad AO2!, { d4 }
|
||||
vldmia.f64 XO! , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vldmia.f64 AO2!, { d4 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
vmla.f64 d3 , d1 , d4
|
||||
|
||||
@@ -143,10 +143,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmiad YO, { d4 - d5 }
|
||||
vldmia.f64 YO, { d4 - d5 }
|
||||
vmla.f64 d4, d0, d2
|
||||
vmla.f64 d5, d0, d3
|
||||
fstmiad YO!, { d4 - d5 }
|
||||
vstmia.f64 YO!, { d4 - d5 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -160,10 +160,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d12 - d15 }
|
||||
vldmia.f64 XO! , { d12 - d15 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d2 , d12 , d8
|
||||
vmla.f64 d2 , d13 , d9
|
||||
vmla.f64 d2 , d14, d10
|
||||
@@ -173,17 +173,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO! , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d2
|
||||
fstmiad YO!, { d4 }
|
||||
vstmia.f64 YO!, { d4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -197,23 +197,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
fldmiad XO , { d12 }
|
||||
vldmia.f64 XO , { d12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d4 - d5 }
|
||||
vldmia.f64 AO2!, { d4 - d5 }
|
||||
|
||||
fldmiad XO , { d13 }
|
||||
vldmia.f64 XO , { d13 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
fldmiad AO2!, { d6 - d7 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO2!, { d6 - d7 }
|
||||
|
||||
fldmiad XO , { d14 }
|
||||
vldmia.f64 XO , { d14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmiad XO , { d15 }
|
||||
vldmia.f64 XO , { d15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f64 d2 , d12 , d8
|
||||
@@ -229,9 +229,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmiad XO , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
fldmiad AO2!, { d4 }
|
||||
vldmia.f64 XO , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vldmia.f64 AO2!, { d4 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d3 , d1 , d4
|
||||
@@ -240,14 +240,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d2
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d5 }
|
||||
vldmia.f64 YO, { d5 }
|
||||
vmla.f64 d5, d0, d3
|
||||
fstmiad YO, { d5 }
|
||||
vstmia.f64 YO, { d5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -261,20 +261,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
fldmiad XO , { d12 }
|
||||
vldmia.f64 XO , { d12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
|
||||
fldmiad XO , { d13 }
|
||||
vldmia.f64 XO , { d13 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
|
||||
fldmiad XO , { d14 }
|
||||
vldmia.f64 XO , { d14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmiad XO , { d15 }
|
||||
vldmia.f64 XO , { d15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f64 d2 , d12 , d8
|
||||
@@ -286,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d1 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO , { d1 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vmla.f64 d2 , d1 , d8
|
||||
add XO, XO, INC_X
|
||||
|
||||
@@ -295,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d4 }
|
||||
vldmia.f64 YO, { d4 }
|
||||
vmla.f64 d4, d0, d2
|
||||
fstmiad YO, { d4 }
|
||||
vstmia.f64 YO, { d4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -315,11 +315,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
fldmias XO! , { s12 - s15 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s4 - s5 }
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
fldmias AO2!, { s6 - s7 }
|
||||
vldmia.f32 XO! , { s12 - s15 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s4 - s5 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO2!, { s6 - s7 }
|
||||
|
||||
vmla.f32 s2 , s12 , s8
|
||||
vmla.f32 s3 , s12 , s4
|
||||
@@ -334,9 +334,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmias XO! , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
fldmias AO2!, { s4 }
|
||||
vldmia.f32 XO! , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vldmia.f32 AO2!, { s4 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
vmla.f32 s3 , s1 , s4
|
||||
|
||||
@@ -344,10 +344,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmias YO, { s4 - s5 }
|
||||
vldmia.f32 YO, { s4 - s5 }
|
||||
vmla.f32 s4, s0, s2
|
||||
vmla.f32 s5, s0, s3
|
||||
fstmias YO!, { s4 - s5 }
|
||||
vstmia.f32 YO!, { s4 - s5 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -359,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
fldmias XO! , { s12 - s15 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 XO! , { s12 - s15 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s2 , s12 , s8
|
||||
vmla.f32 s2 , s13 , s9
|
||||
vmla.f32 s2 , s14, s10
|
||||
@@ -371,17 +371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO! , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s2
|
||||
fstmias YO!, { s4 }
|
||||
vstmia.f32 YO!, { s4 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -395,21 +395,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
fldmias XO , { s12 }
|
||||
vldmia.f32 XO , { s12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s4 - s5 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s4 - s5 }
|
||||
|
||||
fldmias XO , { s13 }
|
||||
vldmia.f32 XO , { s13 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
fldmias AO2!, { s6 - s7 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO2!, { s6 - s7 }
|
||||
|
||||
fldmias XO , { s14 }
|
||||
vldmia.f32 XO , { s14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmias XO , { s15 }
|
||||
vldmia.f32 XO , { s15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f32 s2 , s12 , s8
|
||||
@@ -425,9 +425,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmias XO , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
fldmias AO2!, { s4 }
|
||||
vldmia.f32 XO , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vldmia.f32 AO2!, { s4 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s3 , s1 , s4
|
||||
@@ -436,14 +436,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s2
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s5 }
|
||||
vldmia.f32 YO, { s5 }
|
||||
vmla.f32 s5, s0, s3
|
||||
fstmias YO, { s5 }
|
||||
vstmia.f32 YO, { s5 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -456,20 +456,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
fldmias XO , { s12 }
|
||||
vldmia.f32 XO , { s12 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
|
||||
fldmias XO , { s13 }
|
||||
vldmia.f32 XO , { s13 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
|
||||
fldmias XO , { s14 }
|
||||
vldmia.f32 XO , { s14 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
fldmias XO , { s15 }
|
||||
vldmia.f32 XO , { s15 }
|
||||
add XO, XO, INC_X
|
||||
|
||||
vmla.f32 s2 , s12 , s8
|
||||
@@ -481,8 +481,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s1 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO , { s1 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vmla.f32 s2 , s1 , s8
|
||||
add XO, XO, INC_X
|
||||
|
||||
@@ -490,9 +490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s4 }
|
||||
vldmia.f32 YO, { s4 }
|
||||
vmla.f32 s4, s0, s2
|
||||
fstmias YO, { s4 }
|
||||
vstmia.f32 YO, { s4 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -108,17 +108,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d28 - d31 }
|
||||
vldmia.f64 XO! , { d28 - d31 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d16 - d17 }
|
||||
vldmia.f64 AO2!, { d16 - d17 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
vmla.f64 d5 , d28 , d16
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
vmla.f64 d5 , d29 , d17
|
||||
fldmiad AO2!, { d18 - d19 }
|
||||
vldmia.f64 AO2!, { d18 - d19 }
|
||||
vmla.f64 d4 , d30, d10
|
||||
vmla.f64 d5 , d30, d18
|
||||
vmla.f64 d4 , d31, d11
|
||||
@@ -129,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
fldmiad AO2!, { d16 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vldmia.f64 AO2!, { d16 }
|
||||
vmla.f64 d4 , d2 , d8
|
||||
vmla.f64 d5 , d2 , d16
|
||||
|
||||
@@ -139,10 +139,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmiad YO, { d24 - d25 }
|
||||
vldmia.f64 YO, { d24 - d25 }
|
||||
vmla.f64 d24, d0, d4
|
||||
vmla.f64 d25, d0, d5
|
||||
fstmiad YO!, { d24 - d25 }
|
||||
vstmia.f64 YO!, { d24 - d25 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -156,23 +156,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad XO , { d28 }
|
||||
vldmia.f64 XO , { d28 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
pld [ AO2 , #A_PRE ]
|
||||
fldmiad AO2!, { d16 - d17 }
|
||||
vldmia.f64 AO2!, { d16 - d17 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
fldmiad XO , { d29 }
|
||||
vldmia.f64 XO , { d29 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d5 , d28 , d16
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
fldmiad XO , { d30 }
|
||||
vldmia.f64 XO , { d30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d5 , d29 , d17
|
||||
fldmiad AO2!, { d18 - d19 }
|
||||
vldmia.f64 AO2!, { d18 - d19 }
|
||||
vmla.f64 d4 , d30, d10
|
||||
fldmiad XO , { d31 }
|
||||
vldmia.f64 XO , { d31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d5 , d30, d18
|
||||
vmla.f64 d4 , d31, d11
|
||||
@@ -183,10 +183,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO2!, { d16 }
|
||||
vldmia.f64 AO2!, { d16 }
|
||||
vmla.f64 d4 , d2 , d8
|
||||
vmla.f64 d5 , d2 , d16
|
||||
|
||||
@@ -194,14 +194,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d4
|
||||
fstmiad YO, { d24 }
|
||||
vstmia.f64 YO, { d24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d5
|
||||
fstmiad YO, { d24 }
|
||||
vstmia.f64 YO, { d24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
pld [ XO , #X_PRE ]
|
||||
fldmiad XO! , { d28 - d31 }
|
||||
vldmia.f64 XO! , { d28 - d31 }
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
vmla.f64 d4 , d30, d10
|
||||
vmla.f64 d4 , d31, d11
|
||||
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmiad XO! , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO! , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
vmla.f64 d4 , d2 , d8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d4
|
||||
fstmiad YO!, { d24 }
|
||||
vstmia.f64 YO!, { d24 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -252,18 +252,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
pld [ AO1 , #A_PRE ]
|
||||
fldmiad XO , { d28 }
|
||||
vldmia.f64 XO , { d28 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d8 - d9 }
|
||||
vldmia.f64 AO1!, { d8 - d9 }
|
||||
vmla.f64 d4 , d28 , d8
|
||||
fldmiad XO , { d29 }
|
||||
vldmia.f64 XO , { d29 }
|
||||
add XO, XO, INC_X
|
||||
fldmiad AO1!, { d10 - d11 }
|
||||
vldmia.f64 AO1!, { d10 - d11 }
|
||||
vmla.f64 d4 , d29 , d9
|
||||
fldmiad XO , { d30 }
|
||||
vldmia.f64 XO , { d30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d4 , d30, d10
|
||||
fldmiad XO , { d31 }
|
||||
vldmia.f64 XO , { d31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d4 , d31, d11
|
||||
|
||||
@@ -272,8 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmiad XO , { d2 }
|
||||
fldmiad AO1!, { d8 }
|
||||
vldmia.f64 XO , { d2 }
|
||||
vldmia.f64 AO1!, { d8 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f64 d4 , d2 , d8
|
||||
|
||||
@@ -281,9 +281,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmiad YO, { d24 }
|
||||
vldmia.f64 YO, { d24 }
|
||||
vmla.f64 d24, d0, d4
|
||||
fstmiad YO, { d24 }
|
||||
vstmia.f64 YO, { d24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -300,15 +300,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X4
|
||||
|
||||
fldmias XO! , { s28 - s31 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s16 - s17 }
|
||||
vldmia.f32 XO! , { s28 - s31 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s16 - s17 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
vmla.f32 s5 , s28 , s16
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
vmla.f32 s5 , s29 , s17
|
||||
fldmias AO2!, { s18 - s19 }
|
||||
vldmia.f32 AO2!, { s18 - s19 }
|
||||
vmla.f32 s4 , s30, s10
|
||||
vmla.f32 s5 , s30, s18
|
||||
vmla.f32 s4 , s31, s11
|
||||
@@ -319,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F2X1
|
||||
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
fldmias AO2!, { s16 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vldmia.f32 AO2!, { s16 }
|
||||
vmla.f32 s4 , s2 , s8
|
||||
vmla.f32 s5 , s2 , s16
|
||||
|
||||
@@ -329,10 +329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_F2
|
||||
|
||||
fldmias YO, { s24 - s25 }
|
||||
vldmia.f32 YO, { s24 - s25 }
|
||||
vmla.f32 s24, s0, s4
|
||||
vmla.f32 s25, s0, s5
|
||||
fstmias YO!, { s24 - s25 }
|
||||
vstmia.f32 YO!, { s24 - s25 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -345,22 +345,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X4
|
||||
|
||||
fldmias XO , { s28 }
|
||||
vldmia.f32 XO , { s28 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
fldmias AO2!, { s16 - s17 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO2!, { s16 - s17 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
fldmias XO , { s29 }
|
||||
vldmia.f32 XO , { s29 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s5 , s28 , s16
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
fldmias XO , { s30 }
|
||||
vldmia.f32 XO , { s30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s5 , s29 , s17
|
||||
fldmias AO2!, { s18 - s19 }
|
||||
vldmia.f32 AO2!, { s18 - s19 }
|
||||
vmla.f32 s4 , s30, s10
|
||||
fldmias XO , { s31 }
|
||||
vldmia.f32 XO , { s31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s5 , s30, s18
|
||||
vmla.f32 s4 , s31, s11
|
||||
@@ -371,10 +371,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S2X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO2!, { s16 }
|
||||
vldmia.f32 AO2!, { s16 }
|
||||
vmla.f32 s4 , s2 , s8
|
||||
vmla.f32 s5 , s2 , s16
|
||||
|
||||
@@ -382,14 +382,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S2
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s4
|
||||
fstmias YO, { s24 }
|
||||
vstmia.f32 YO, { s24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s5
|
||||
fstmias YO, { s24 }
|
||||
vstmia.f32 YO, { s24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
@@ -402,10 +402,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X4
|
||||
|
||||
fldmias XO! , { s28 - s31 }
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
vldmia.f32 XO! , { s28 - s31 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
vmla.f32 s4 , s30, s10
|
||||
vmla.f32 s4 , s31, s11
|
||||
@@ -415,17 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1X1
|
||||
|
||||
fldmias XO! , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO! , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
vmla.f32 s4 , s2 , s8
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE_F1
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s4
|
||||
fstmias YO!, { s24 }
|
||||
vstmia.f32 YO!, { s24 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -437,18 +437,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X4
|
||||
|
||||
fldmias XO , { s28 }
|
||||
vldmia.f32 XO , { s28 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s8 - s9 }
|
||||
vldmia.f32 AO1!, { s8 - s9 }
|
||||
vmla.f32 s4 , s28 , s8
|
||||
fldmias XO , { s29 }
|
||||
vldmia.f32 XO , { s29 }
|
||||
add XO, XO, INC_X
|
||||
fldmias AO1!, { s10 - s11 }
|
||||
vldmia.f32 AO1!, { s10 - s11 }
|
||||
vmla.f32 s4 , s29 , s9
|
||||
fldmias XO , { s30 }
|
||||
vldmia.f32 XO , { s30 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s4 , s30, s10
|
||||
fldmias XO , { s31 }
|
||||
vldmia.f32 XO , { s31 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s4 , s31, s11
|
||||
|
||||
@@ -457,8 +457,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1X1
|
||||
|
||||
fldmias XO , { s2 }
|
||||
fldmias AO1!, { s8 }
|
||||
vldmia.f32 XO , { s2 }
|
||||
vldmia.f32 AO1!, { s8 }
|
||||
add XO, XO, INC_X
|
||||
vmla.f32 s4 , s2 , s8
|
||||
|
||||
@@ -466,9 +466,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro SAVE_S1
|
||||
|
||||
fldmias YO, { s24 }
|
||||
vldmia.f32 YO, { s24 }
|
||||
vmla.f32 s24, s0, s4
|
||||
fstmias YO, { s24 }
|
||||
vstmia.f32 YO, { s24 }
|
||||
add YO, YO, INC_Y
|
||||
|
||||
.endm
|
||||
|
||||
@@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F
|
||||
|
||||
fldmiad X!, { d0 }
|
||||
vldmia.f64 X!, { d0 }
|
||||
VABS( d0, d0 )
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
@@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
add Z, Z, #1
|
||||
VABS( d4, d4 )
|
||||
vcmpe.f64 d4, d0
|
||||
@@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S
|
||||
|
||||
fldmiad X, { d0 }
|
||||
vldmia.f64 X, { d0 }
|
||||
VABS( d0, d0 )
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
@@ -146,7 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
add Z, Z, #1
|
||||
VABS( d4, d4 )
|
||||
vcmpe.f64 d4, d0
|
||||
@@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F
|
||||
|
||||
fldmias X!, { s0 }
|
||||
vldmia.f32 X!, { s0 }
|
||||
VABS( s0, s0 )
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
add Z, Z, #1
|
||||
VABS( s4, s4 )
|
||||
vcmpe.f32 s4, s0
|
||||
@@ -182,7 +182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S
|
||||
|
||||
fldmias X, { s0 }
|
||||
vldmia.f32 X, { s0 }
|
||||
VABS( s0, s0 )
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
@@ -193,7 +193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
add Z, Z, #1
|
||||
VABS( s4, s4 )
|
||||
vcmpe.f32 s4, s0
|
||||
@@ -215,7 +215,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F
|
||||
|
||||
fldmiad X!, { d0 -d1 }
|
||||
vldmia.f64 X!, { d0 -d1 }
|
||||
vabs.f64 d0, d0
|
||||
vabs.f64 d1, d1
|
||||
vadd.f64 d0 , d0, d1
|
||||
@@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
add Z, Z, #1
|
||||
vabs.f64 d4, d4
|
||||
vabs.f64 d5, d5
|
||||
@@ -241,7 +241,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S
|
||||
|
||||
fldmiad X, { d0 -d1 }
|
||||
vldmia.f64 X, { d0 -d1 }
|
||||
vabs.f64 d0, d0
|
||||
vabs.f64 d1, d1
|
||||
vadd.f64 d0 , d0, d1
|
||||
@@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
add Z, Z, #1
|
||||
vabs.f64 d4, d4
|
||||
vabs.f64 d5, d5
|
||||
@@ -272,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_F
|
||||
|
||||
fldmias X!, { s0 -s1 }
|
||||
vldmia.f32 X!, { s0 -s1 }
|
||||
vabs.f32 s0, s0
|
||||
vabs.f32 s1, s1
|
||||
vadd.f32 s0 , s0, s1
|
||||
@@ -284,7 +284,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
add Z, Z, #1
|
||||
vabs.f32 s4, s4
|
||||
vabs.f32 s5, s5
|
||||
@@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro INIT_S
|
||||
|
||||
fldmias X, { s0 -s1 }
|
||||
vldmia.f32 X, { s0 -s1 }
|
||||
vabs.f32 s0, s0
|
||||
vabs.f32 s1, s1
|
||||
vadd.f32 s0 , s0, s1
|
||||
@@ -312,7 +312,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
add Z, Z, #1
|
||||
vabs.f32 s4, s4
|
||||
vabs.f32 s5, s5
|
||||
|
||||
@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
@@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_NEXT
|
||||
@@ -121,7 +121,7 @@ KERNEL_S1_NEXT:
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
@@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_NEXT
|
||||
@@ -191,7 +191,7 @@ KERNEL_S1_NEXT:
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
@@ -249,7 +249,7 @@ KERNEL_F1_END_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
@@ -294,7 +294,7 @@ KERNEL_S1_END_\@:
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
@@ -350,7 +350,7 @@ KERNEL_F1_END_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
|
||||
@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 }
|
||||
vldmia.f64 X!, { d4 }
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
@@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_NEXT
|
||||
@@ -121,7 +121,7 @@ KERNEL_S1_NEXT:
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
@@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_NEXT
|
||||
@@ -191,7 +191,7 @@ KERNEL_S1_NEXT:
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
@@ -249,7 +249,7 @@ KERNEL_F1_END_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
@@ -294,7 +294,7 @@ KERNEL_S1_END_\@:
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 - s5 }
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
@@ -350,7 +350,7 @@ KERNEL_F1_END_\@:
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
|
||||
@@ -77,68 +77,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d5 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d5 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d5 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d5 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d5 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d5 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d5 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d5 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d5 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d5 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
fldmiad Y, { d5 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vldmia.f64 Y, { d5 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X, { d2 }
|
||||
fstmiad Y, { d3 }
|
||||
vstmia.f64 X, { d2 }
|
||||
vstmia.f64 Y, { d3 }
|
||||
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
@@ -149,68 +149,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s5 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s5 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s5 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s5 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s5 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s5 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s5 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s5 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s5 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s5 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s5 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s5 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X, { s2 }
|
||||
fstmias Y, { s3 }
|
||||
vstmia.f32 X, { s2 }
|
||||
vstmia.f32 Y, { s3 }
|
||||
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
@@ -230,96 +230,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d6 - d7 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d6 - d7 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d6 - d7 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d6 - d7 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d6 - d7 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d6 - d7 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d6 - d7 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d6 - d7 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d6 - d7 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d6 - d7 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vstmia.f64 X!, { d2 }
|
||||
vstmia.f64 Y!, { d3 }
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d6 - d7 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d6 - d7 }
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
@@ -347,96 +347,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s6 - s7 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s6 - s7 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s6 - s7 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s6 - s7 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s6 - s7 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s6 - s7 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s6 - s7 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s6 - s7 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s6 - s7 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s6 - s7 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vstmia.f32 X!, { s2 }
|
||||
vstmia.f32 Y!, { s3 }
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
fldmias Y, { s6 - s7 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vldmia.f32 Y, { s6 - s7 }
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
|
||||
@@ -64,30 +64,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmiad X, { d4 - d7 }
|
||||
vldmia.f64 X, { d4 - d7 }
|
||||
vmul.f64 d4, d4, d0
|
||||
vmul.f64 d5, d5, d0
|
||||
vmul.f64 d6, d6, d0
|
||||
fstmiad X!, { d4 - d5 }
|
||||
vstmia.f64 X!, { d4 - d5 }
|
||||
vmul.f64 d7, d7, d0
|
||||
fstmiad X!, { d6 - d7 }
|
||||
vstmia.f64 X!, { d6 - d7 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vmul.f64 d4, d4, d0
|
||||
fstmiad X!, { d4 }
|
||||
vstmia.f64 X!, { d4 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 }
|
||||
vldmia.f64 X, { d4 }
|
||||
vmul.f64 d4, d4, d0
|
||||
fstmiad X, { d4 }
|
||||
vstmia.f64 X, { d4 }
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
@@ -96,30 +96,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X, { s4 - s7 }
|
||||
vldmia.f32 X, { s4 - s7 }
|
||||
vmul.f32 s4, s4, s0
|
||||
vmul.f32 s5, s5, s0
|
||||
vmul.f32 s6, s6, s0
|
||||
fstmias X!, { s4 - s5 }
|
||||
vstmia.f32 X!, { s4 - s5 }
|
||||
vmul.f32 s7, s7, s0
|
||||
fstmias X!, { s6 - s7 }
|
||||
vstmia.f32 X!, { s6 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vmul.f32 s4, s4, s0
|
||||
fstmias X!, { s4 }
|
||||
vstmia.f32 X!, { s4 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vmul.f32 s4, s4, s0
|
||||
fstmias X, { s4 }
|
||||
vstmia.f32 X, { s4 }
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
@@ -136,58 +136,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
vstmia.f64 X!, { d2 - d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
vstmia.f64 X!, { d2 - d3 }
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
vstmia.f64 X!, { d2 - d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
vstmia.f64 X!, { d2 - d3 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
vstmia.f64 X!, { d2 - d3 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X, { d2 - d3 }
|
||||
vstmia.f64 X, { d2 - d3 }
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
@@ -199,56 +199,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
vstmia.f32 X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
vstmia.f32 X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
vstmia.f32 X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
vstmia.f32 X!, { s2 - s3 }
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
vstmia.f32 X!, { s2 - s3 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X, { s2 - s3 }
|
||||
vstmia.f32 X, { s2 - s3 }
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
@@ -65,17 +65,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_F8
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
fldmias X!, { s0 - s3 }
|
||||
fldmias X!, { s4 - s7 }
|
||||
fstmias Y!, { s0 - s3 }
|
||||
fstmias Y!, { s4 - s7 }
|
||||
vldmia.f32 X!, { s0 - s3 }
|
||||
vldmia.f32 X!, { s4 - s7 }
|
||||
vstmia.f32 Y!, { s0 - s3 }
|
||||
vstmia.f32 Y!, { s4 - s7 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY_F1
|
||||
|
||||
fldmias X!, { s0 }
|
||||
fstmias Y!, { s0 }
|
||||
vldmia.f32 X!, { s0 }
|
||||
vstmia.f32 Y!, { s0 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -85,23 +85,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_S4
|
||||
|
||||
nop
|
||||
fldmias X, { s0 }
|
||||
fstmias Y, { s0 }
|
||||
vldmia.f32 X, { s0 }
|
||||
vstmia.f32 Y, { s0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s1 }
|
||||
fstmias Y, { s1 }
|
||||
vldmia.f32 X, { s1 }
|
||||
vstmia.f32 Y, { s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s0 }
|
||||
fstmias Y, { s0 }
|
||||
vldmia.f32 X, { s0 }
|
||||
vstmia.f32 Y, { s0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s1 }
|
||||
fstmias Y, { s1 }
|
||||
vldmia.f32 X, { s1 }
|
||||
vstmia.f32 Y, { s1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -110,8 +110,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY_S1
|
||||
|
||||
fldmias X, { s0 }
|
||||
fstmias Y, { s0 }
|
||||
vldmia.f32 X, { s0 }
|
||||
vstmia.f32 Y, { s0 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -68,26 +68,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X!, { s14 }
|
||||
fldmias Y!, { s15 }
|
||||
vldmia.f32 X!, { s14 }
|
||||
vldmia.f32 Y!, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
fldmias X!, { s14 }
|
||||
fldmias Y!, { s15 }
|
||||
vldmia.f32 X!, { s14 }
|
||||
vldmia.f32 Y!, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
fldmias X!, { s14 }
|
||||
fldmias Y!, { s15 }
|
||||
vldmia.f32 X!, { s14 }
|
||||
vldmia.f32 Y!, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
fldmias X!, { s14 }
|
||||
fldmias Y!, { s15 }
|
||||
vldmia.f32 X!, { s14 }
|
||||
vldmia.f32 Y!, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
@@ -96,8 +96,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s14 }
|
||||
fldmias Y!, { s15 }
|
||||
vldmia.f32 X!, { s14 }
|
||||
vldmia.f32 Y!, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
@@ -109,32 +109,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
nop
|
||||
|
||||
fldmias X, { s14 }
|
||||
fldmias Y, { s15 }
|
||||
vldmia.f32 X, { s14 }
|
||||
vldmia.f32 Y, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s14 }
|
||||
fldmias Y, { s15 }
|
||||
vldmia.f32 X, { s14 }
|
||||
vldmia.f32 Y, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s14 }
|
||||
fldmias Y, { s15 }
|
||||
vldmia.f32 X, { s14 }
|
||||
vldmia.f32 Y, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmias X, { s14 }
|
||||
fldmias Y, { s15 }
|
||||
vldmia.f32 X, { s14 }
|
||||
vldmia.f32 Y, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
@@ -146,8 +146,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s14 }
|
||||
fldmias Y, { s15 }
|
||||
vldmia.f32 X, { s14 }
|
||||
vldmia.f32 Y, { s15 }
|
||||
vmul.f32 s15, s14, s15
|
||||
vcvt.f64.f32 d4, s15
|
||||
vadd.f64 d0 , d0, d4
|
||||
@@ -162,12 +162,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X!, { s8 - s9 }
|
||||
fldmias Y!, { s4 - s5}
|
||||
vldmia.f32 X!, { s8 - s9 }
|
||||
vldmia.f32 Y!, { s4 - s5}
|
||||
fmacs s0 , s4, s8
|
||||
fldmias X!, { s10 - s11 }
|
||||
vldmia.f32 X!, { s10 - s11 }
|
||||
fmacs s1 , s5, s9
|
||||
fldmias Y!, { s6 - s7 }
|
||||
vldmia.f32 Y!, { s6 - s7 }
|
||||
fmacs s0 , s6, s10
|
||||
fmacs s1 , s7, s11
|
||||
|
||||
@@ -175,8 +175,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X!, { s4 }
|
||||
fldmias Y!, { s8 }
|
||||
vldmia.f32 X!, { s4 }
|
||||
vldmia.f32 Y!, { s8 }
|
||||
fmacs s0 , s4, s8
|
||||
|
||||
.endm
|
||||
@@ -185,26 +185,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL_S4
|
||||
|
||||
nop
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s8 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s8 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacs s0 , s4, s8
|
||||
|
||||
fldmias X, { s5 }
|
||||
fldmias Y, { s9 }
|
||||
vldmia.f32 X, { s5 }
|
||||
vldmia.f32 Y, { s9 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacs s1 , s5, s9
|
||||
|
||||
fldmias X, { s6 }
|
||||
fldmias Y, { s10 }
|
||||
vldmia.f32 X, { s6 }
|
||||
vldmia.f32 Y, { s10 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacs s0 , s6, s10
|
||||
|
||||
fldmias X, { s7 }
|
||||
fldmias Y, { s11 }
|
||||
vldmia.f32 X, { s7 }
|
||||
vldmia.f32 Y, { s11 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
fmacs s1 , s7, s11
|
||||
@@ -214,8 +214,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s4 }
|
||||
fldmias Y, { s8 }
|
||||
vldmia.f32 X, { s4 }
|
||||
vldmia.f32 Y, { s8 }
|
||||
add X, X, INC_X
|
||||
fmacs s0 , s4, s8
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -112,8 +112,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL4x2_SUB
|
||||
|
||||
fldmias AO! , { s0 - s3 }
|
||||
fldmias BO! , { s4 - s5 }
|
||||
vldmia.f32 AO! , { s0 - s3 }
|
||||
vldmia.f32 BO! , { s4 - s5 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s1, s4
|
||||
|
||||
@@ -136,29 +136,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL4x4_I
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmuls s17 , s1, s8
|
||||
fmuls s18 , s2, s8
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmuls s19 , s3, s8
|
||||
|
||||
fmuls s20 , s0, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmuls s21 , s1, s9
|
||||
fmuls s22 , s2, s9
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmuls s23 , s3, s9
|
||||
|
||||
fmuls s24 , s0, s10
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmuls s25 , s1, s10
|
||||
fmuls s26 , s2, s10
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmuls s27 , s3, s10
|
||||
|
||||
fmuls s28 , s0, s11
|
||||
@@ -174,20 +174,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ AO , #A_PRE ]
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s17 , s5, s12
|
||||
fldmias AO!, { s0 - s3 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
fmacs s18 , s6, s12
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s19 , s7, s12
|
||||
|
||||
fmacs s20 , s4, s13
|
||||
fldmias BO!, { s8 - s11 }
|
||||
vldmia.f32 BO!, { s8 - s11 }
|
||||
fmacs s21 , s5, s13
|
||||
fmacs s22 , s6, s13
|
||||
//fldmias AO!, { s2 - s3 }
|
||||
//vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s23 , s7, s13
|
||||
|
||||
fmacs s24 , s4, s14
|
||||
//fldmias BO!, { s10 - s11 }
|
||||
//vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s25 , s5, s14
|
||||
fmacs s26 , s6, s14
|
||||
fmacs s27 , s7, s14
|
||||
@@ -203,17 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL4x4_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s7 }
|
||||
vldmia.f32 AO!, { s4 - s7 }
|
||||
fmacs s17 , s1, s8
|
||||
fmacs s18 , s2, s8
|
||||
fldmias BO!, { s12 - s15 }
|
||||
//fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 BO!, { s12 - s15 }
|
||||
//vldmia.f32 AO!, { s6 - s7 }
|
||||
fmacs s19 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s9
|
||||
fmacs s21 , s1, s9
|
||||
fmacs s22 , s2, s9
|
||||
//fldmias BO!, { s14 - s15 }
|
||||
//vldmia.f32 BO!, { s14 - s15 }
|
||||
fmacs s23 , s3, s9
|
||||
|
||||
fmacs s24 , s0, s10
|
||||
@@ -300,7 +300,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0, ALPHA
|
||||
add r4 , CO2, r3
|
||||
|
||||
fldmias CO1, { s8 - s11 }
|
||||
vldmia.f32 CO1, { s8 - s11 }
|
||||
|
||||
fmacs s8 , s0 , s16
|
||||
flds s12, [CO2]
|
||||
@@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ CO1 , #C_PRE ]
|
||||
|
||||
fldmias r4, { s8 - s11 }
|
||||
vldmia.f32 r4, { s8 - s11 }
|
||||
|
||||
fmacs s8 , s0 , s24
|
||||
fsts s12, [CO2]
|
||||
@@ -338,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add CO2, r4 , r3
|
||||
|
||||
|
||||
fldmias CO2, { s12 - s15 }
|
||||
vldmia.f32 CO2, { s12 - s15 }
|
||||
|
||||
fsts s8 , [r4 ]
|
||||
fmacs s12, s0 , s28
|
||||
@@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmacs s15, s0 , s31
|
||||
|
||||
pld [ r4 , #C_PRE ]
|
||||
fstmias CO2, { s12 - s15 }
|
||||
vstmia.f32 CO2, { s12 - s15 }
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
@@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s3 , [ AO2, #4 ]
|
||||
|
||||
add AO1, AO1, #8
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO2, AO2, #8
|
||||
|
||||
.endm
|
||||
@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s1 , [ AO2, #0 ]
|
||||
add AO1, AO1, #4
|
||||
|
||||
fstmias BO!, { s0 - s1 }
|
||||
vstmia.f32 BO!, { s0 - s1 }
|
||||
add AO2, AO2, #4
|
||||
|
||||
.endm
|
||||
@@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s0 , [ AO1, #0 ]
|
||||
flds s1 , [ AO1, #4 ]
|
||||
|
||||
fstmias BO!, { s0 - s1 }
|
||||
vstmia.f32 BO!, { s0 - s1 }
|
||||
add AO1, AO1, #8
|
||||
|
||||
.endm
|
||||
@@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
flds s0 , [ AO1, #0 ]
|
||||
|
||||
fstmias BO!, { s0 }
|
||||
vstmia.f32 BO!, { s0 }
|
||||
add AO1, AO1, #4
|
||||
|
||||
.endm
|
||||
|
||||
@@ -100,10 +100,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s11, [ AO4, #8 ]
|
||||
flds s15, [ AO4, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO4, AO4, #16
|
||||
fstmias BO!, { s4 - s7 }
|
||||
fstmias BO!, { s8 - s15 }
|
||||
vstmia.f32 BO!, { s4 - s7 }
|
||||
vstmia.f32 BO!, { s8 - s15 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -117,7 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s3 , [ AO4, #0 ]
|
||||
|
||||
add AO3, AO3, #4
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO4, AO4, #4
|
||||
|
||||
.endm
|
||||
@@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s5 , [ AO2, #8 ]
|
||||
flds s7 , [ AO2, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s7 }
|
||||
vstmia.f32 BO!, { s0 - s7 }
|
||||
add AO2, AO2, #16
|
||||
|
||||
.endm
|
||||
@@ -147,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s1 , [ AO2, #0 ]
|
||||
add AO1, AO1, #4
|
||||
|
||||
fstmias BO!, { s0 - s1 }
|
||||
vstmia.f32 BO!, { s0 - s1 }
|
||||
add AO2, AO2, #4
|
||||
|
||||
.endm
|
||||
@@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
flds s2 , [ AO1, #8 ]
|
||||
flds s3 , [ AO1, #12 ]
|
||||
|
||||
fstmias BO!, { s0 - s3 }
|
||||
vstmia.f32 BO!, { s0 - s3 }
|
||||
add AO1, AO1, #16
|
||||
|
||||
.endm
|
||||
@@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
flds s0 , [ AO1, #0 ]
|
||||
|
||||
fstmias BO!, { s0 }
|
||||
vstmia.f32 BO!, { s0 }
|
||||
add AO1, AO1, #4
|
||||
|
||||
.endm
|
||||
|
||||
@@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY4x4_1
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmias r3, { s4 - s7 }
|
||||
vldmia.f32 r3, { s4 - s7 }
|
||||
|
||||
add r3, r3, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmias r3, { s8 - s11 }
|
||||
vldmia.f32 r3, { s8 - s11 }
|
||||
|
||||
add r3, r3, LDA
|
||||
pld [ r3, #A_PRE ]
|
||||
fldmias r3, { s12 - s15 }
|
||||
vldmia.f32 r3, { s12 - s15 }
|
||||
|
||||
fstmias BO1, { s0 - s15 }
|
||||
vstmia.f32 BO1, { s0 - s15 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY4x4_2
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s4 - s7 }
|
||||
vldmia.f32 r3, { s4 - s7 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmias r3, { s8 - s11 }
|
||||
vldmia.f32 r3, { s8 - s11 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmias r3, { s12 - s15 }
|
||||
vldmia.f32 r3, { s12 - s15 }
|
||||
|
||||
fstmias BO1, { s0 - s15 }
|
||||
vstmia.f32 BO1, { s0 - s15 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -118,18 +118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x4
|
||||
|
||||
fldmias AO1, { s0 - s1 }
|
||||
vldmia.f32 AO1, { s0 - s1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s2 - s3 }
|
||||
vldmia.f32 r3, { s2 - s3 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmias r3, { s4 - s5 }
|
||||
vldmia.f32 r3, { s4 - s5 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmias r3, { s6 - s7 }
|
||||
vldmia.f32 r3, { s6 - s7 }
|
||||
|
||||
fstmias BO2, { s0 - s7 }
|
||||
vstmia.f32 BO2, { s0 - s7 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #32
|
||||
|
||||
@@ -137,18 +137,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x4
|
||||
|
||||
fldmias AO1, { s0 }
|
||||
vldmia.f32 AO1, { s0 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s1 }
|
||||
vldmia.f32 r3, { s1 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmias r3, { s2 }
|
||||
vldmia.f32 r3, { s2 }
|
||||
|
||||
add r3, r3, LDA
|
||||
fldmias r3, { s3 }
|
||||
vldmia.f32 r3, { s3 }
|
||||
|
||||
fstmias BO3, { s0 - s3 }
|
||||
vstmia.f32 BO3, { s0 - s3 }
|
||||
add AO1, AO1, #4
|
||||
add BO3, BO3, #16
|
||||
|
||||
@@ -158,12 +158,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY4x2
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s4 - s7 }
|
||||
vldmia.f32 r3, { s4 - s7 }
|
||||
|
||||
fstmias BO1, { s0 - s7 }
|
||||
vstmia.f32 BO1, { s0 - s7 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -171,12 +171,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x2
|
||||
|
||||
fldmias AO1, { s0 - s1 }
|
||||
vldmia.f32 AO1, { s0 - s1 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s2 - s3 }
|
||||
vldmia.f32 r3, { s2 - s3 }
|
||||
|
||||
fstmias BO2, { s0 - s3 }
|
||||
vstmia.f32 BO2, { s0 - s3 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #16
|
||||
|
||||
@@ -184,12 +184,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x2
|
||||
|
||||
fldmias AO1, { s0 }
|
||||
vldmia.f32 AO1, { s0 }
|
||||
|
||||
add r3, AO1, LDA
|
||||
fldmias r3, { s1 }
|
||||
vldmia.f32 r3, { s1 }
|
||||
|
||||
fstmias BO3, { s0 - s1 }
|
||||
vstmia.f32 BO3, { s0 - s1 }
|
||||
add AO1, AO1, #4
|
||||
add BO3, BO3, #8
|
||||
|
||||
@@ -199,9 +199,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY4x1
|
||||
|
||||
fldmias AO1, { s0 - s3 }
|
||||
vldmia.f32 AO1, { s0 - s3 }
|
||||
|
||||
fstmias BO1, { s0 - s3 }
|
||||
vstmia.f32 BO1, { s0 - s3 }
|
||||
add AO1, AO1, #16
|
||||
add BO1, BO1, M4
|
||||
|
||||
@@ -209,9 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY2x1
|
||||
|
||||
fldmias AO1, { s0 - s1 }
|
||||
vldmia.f32 AO1, { s0 - s1 }
|
||||
|
||||
fstmias BO2, { s0 - s1 }
|
||||
vstmia.f32 BO2, { s0 - s1 }
|
||||
add AO1, AO1, #8
|
||||
add BO2, BO2, #8
|
||||
|
||||
@@ -219,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY1x1
|
||||
|
||||
fldmias AO1, { s0 }
|
||||
vldmia.f32 AO1, { s0 }
|
||||
|
||||
fstmias BO3, { s0 }
|
||||
vstmia.f32 BO3, { s0 }
|
||||
add AO1, AO1, #4
|
||||
add BO3, BO3, #4
|
||||
|
||||
|
||||
@@ -118,8 +118,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL4x2_SUB
|
||||
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fldmias BO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s0 - s3 }
|
||||
vldmia.f32 BO!, { s4 - s5 }
|
||||
|
||||
fmacs s8 , s0, s4
|
||||
fmacs s9 , s1, s4
|
||||
|
||||
@@ -122,30 +122,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL4x4_I
|
||||
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
pld [ AO , #A_PRE-8 ]
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
pld [ BO , #B_PRE-8 ]
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmuls s17 , s1, s8
|
||||
fmuls s18 , s2, s8
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmuls s19 , s3, s8
|
||||
|
||||
fmuls s20 , s0, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmuls s21 , s1, s9
|
||||
fmuls s22 , s2, s9
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmuls s23 , s3, s9
|
||||
|
||||
fmuls s24 , s0, s10
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmuls s25 , s1, s10
|
||||
fmuls s26 , s2, s10
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmuls s27 , s3, s10
|
||||
|
||||
fmuls s28 , s0, s11
|
||||
@@ -161,20 +161,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ AO , #A_PRE ]
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s17 , s5, s12
|
||||
fldmias AO!, { s0 - s1 }
|
||||
vldmia.f32 AO!, { s0 - s1 }
|
||||
fmacs s18 , s6, s12
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s19 , s7, s12
|
||||
|
||||
fmacs s20 , s4, s13
|
||||
fldmias AO!, { s2 - s3 }
|
||||
vldmia.f32 AO!, { s2 - s3 }
|
||||
fmacs s21 , s5, s13
|
||||
fmacs s22 , s6, s13
|
||||
fldmias BO!, { s8 - s9 }
|
||||
vldmia.f32 BO!, { s8 - s9 }
|
||||
fmacs s23 , s7, s13
|
||||
|
||||
fmacs s24 , s4, s14
|
||||
fldmias BO!, { s10 - s11 }
|
||||
vldmia.f32 BO!, { s10 - s11 }
|
||||
fmacs s25 , s5, s14
|
||||
fmacs s26 , s6, s14
|
||||
fmacs s27 , s7, s14
|
||||
@@ -190,17 +190,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro KERNEL4x4_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s5 }
|
||||
vldmia.f32 AO!, { s4 - s5 }
|
||||
fmacs s17 , s1, s8
|
||||
fmacs s18 , s2, s8
|
||||
fldmias AO!, { s6 - s7 }
|
||||
vldmia.f32 AO!, { s6 - s7 }
|
||||
fmacs s19 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s9
|
||||
fldmias BO!, { s12 - s13 }
|
||||
vldmia.f32 BO!, { s12 - s13 }
|
||||
fmacs s21 , s1, s9
|
||||
fmacs s22 , s2, s9
|
||||
fldmias BO!, { s14 - s15 }
|
||||
vldmia.f32 BO!, { s14 - s15 }
|
||||
fmacs s23 , s3, s9
|
||||
|
||||
fmacs s24 , s0, s10
|
||||
@@ -325,7 +325,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fsts s11, [r4 , #12 ]
|
||||
fmuls s15, s0 , s31
|
||||
|
||||
fstmias CO2, { s12 - s15 }
|
||||
vstmia.f32 CO2, { s12 - s15 }
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
|
||||
@@ -103,29 +103,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad X, { d0 - d3 }
|
||||
fldmiad Y, { d4 - d7 }
|
||||
fstmiad Y!, { d0 - d3 }
|
||||
fstmiad X!, { d4 - d7}
|
||||
vldmia.f64 X, { d0 - d3 }
|
||||
vldmia.f64 Y, { d4 - d7 }
|
||||
vstmia.f64 Y!, { d0 - d3 }
|
||||
vstmia.f64 X!, { d4 - d7}
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X, { d0 }
|
||||
fldmiad Y, { d4 }
|
||||
fstmiad Y!, { d0 }
|
||||
fstmiad X!, { d4 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vldmia.f64 Y, { d4 }
|
||||
vstmia.f64 Y!, { d0 }
|
||||
vstmia.f64 X!, { d4 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d0 }
|
||||
fldmiad Y, { d4 }
|
||||
fstmiad Y, { d0 }
|
||||
fstmiad X, { d4 }
|
||||
vldmia.f64 X, { d0 }
|
||||
vldmia.f64 Y, { d4 }
|
||||
vstmia.f64 Y, { d0 }
|
||||
vstmia.f64 X, { d4 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -135,29 +135,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
fldmias X, { s0 - s3 }
|
||||
fldmias Y, { s4 - s7 }
|
||||
fstmias Y!, { s0 - s3 }
|
||||
fstmias X!, { s4 - s7}
|
||||
vldmia.f32 X, { s0 - s3 }
|
||||
vldmia.f32 Y, { s4 - s7 }
|
||||
vstmia.f32 Y!, { s0 - s3 }
|
||||
vstmia.f32 X!, { s4 - s7}
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X, { s0 }
|
||||
fldmias Y, { s4 }
|
||||
fstmias Y!, { s0 }
|
||||
fstmias X!, { s4 }
|
||||
vldmia.f32 X, { s0 }
|
||||
vldmia.f32 Y, { s4 }
|
||||
vstmia.f32 Y!, { s0 }
|
||||
vstmia.f32 X!, { s4 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s0 }
|
||||
fldmias Y, { s4 }
|
||||
fstmias Y, { s0 }
|
||||
fstmias X, { s4 }
|
||||
vldmia.f32 X, { s0 }
|
||||
vldmia.f32 Y, { s4 }
|
||||
vstmia.f32 Y, { s0 }
|
||||
vstmia.f32 X, { s4 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -174,35 +174,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad X, { d0 - d3 }
|
||||
fldmiad Y, { d4 - d7 }
|
||||
fstmiad Y!, { d0 - d3 }
|
||||
fstmiad X!, { d4 - d7}
|
||||
vldmia.f64 X, { d0 - d3 }
|
||||
vldmia.f64 Y, { d4 - d7 }
|
||||
vstmia.f64 Y!, { d0 - d3 }
|
||||
vstmia.f64 X!, { d4 - d7}
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmiad X, { d0 - d3 }
|
||||
fldmiad Y, { d4 - d7 }
|
||||
fstmiad Y!, { d0 - d3 }
|
||||
fstmiad X!, { d4 - d7}
|
||||
vldmia.f64 X, { d0 - d3 }
|
||||
vldmia.f64 Y, { d4 - d7 }
|
||||
vstmia.f64 Y!, { d0 - d3 }
|
||||
vstmia.f64 X!, { d4 - d7}
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X, { d0 - d1 }
|
||||
fldmiad Y, { d4 - d5 }
|
||||
fstmiad Y!, { d0 - d1 }
|
||||
fstmiad X!, { d4 - d5 }
|
||||
vldmia.f64 X, { d0 - d1 }
|
||||
vldmia.f64 Y, { d4 - d5 }
|
||||
vstmia.f64 Y!, { d0 - d1 }
|
||||
vstmia.f64 X!, { d4 - d5 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d0 - d1 }
|
||||
fldmiad Y, { d4 - d5 }
|
||||
fstmiad Y, { d0 - d1 }
|
||||
fstmiad X, { d4 - d5 }
|
||||
vldmia.f64 X, { d0 - d1 }
|
||||
vldmia.f64 Y, { d4 - d5 }
|
||||
vstmia.f64 Y, { d0 - d1 }
|
||||
vstmia.f64 X, { d4 - d5 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -215,33 +215,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
fldmias X, { s0 - s3 }
|
||||
fldmias Y, { s4 - s7 }
|
||||
fstmias Y!, { s0 - s3 }
|
||||
fstmias X!, { s4 - s7}
|
||||
vldmia.f32 X, { s0 - s3 }
|
||||
vldmia.f32 Y, { s4 - s7 }
|
||||
vstmia.f32 Y!, { s0 - s3 }
|
||||
vstmia.f32 X!, { s4 - s7}
|
||||
|
||||
fldmias X, { s0 - s3 }
|
||||
fldmias Y, { s4 - s7 }
|
||||
fstmias Y!, { s0 - s3 }
|
||||
fstmias X!, { s4 - s7}
|
||||
vldmia.f32 X, { s0 - s3 }
|
||||
vldmia.f32 Y, { s4 - s7 }
|
||||
vstmia.f32 Y!, { s0 - s3 }
|
||||
vstmia.f32 X!, { s4 - s7}
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmias X, { s0 - s1 }
|
||||
fldmias Y, { s4 - s5 }
|
||||
fstmias Y!, { s0 - s1 }
|
||||
fstmias X!, { s4 - s5 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vldmia.f32 Y, { s4 - s5 }
|
||||
vstmia.f32 Y!, { s0 - s1 }
|
||||
vstmia.f32 X!, { s4 - s5 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmias X, { s0 - s1 }
|
||||
fldmias Y, { s4 - s5 }
|
||||
fstmias Y, { s0 - s1 }
|
||||
fstmias X, { s4 - s5 }
|
||||
vldmia.f32 X, { s0 - s1 }
|
||||
vldmia.f32 Y, { s4 - s5 }
|
||||
vstmia.f32 Y, { s0 - s1 }
|
||||
vstmia.f32 X, { s4 - s5 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -66,15 +66,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ X, #X_PRE+32 ]
|
||||
fldmiad X!, { d0 - d7 }
|
||||
fstmiad Y!, { d0 - d7 }
|
||||
vldmia.f64 X!, { d0 - d7 }
|
||||
vstmia.f64 Y!, { d0 - d7 }
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY_F1
|
||||
|
||||
fldmiad X!, { d0 - d1 }
|
||||
fstmiad Y!, { d0 - d1 }
|
||||
vldmia.f64 X!, { d0 - d1 }
|
||||
vstmia.f64 Y!, { d0 - d1 }
|
||||
|
||||
.endm
|
||||
|
||||
@@ -84,23 +84,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_S4
|
||||
|
||||
nop
|
||||
fldmiad X, { d0 - d1 }
|
||||
fstmiad Y, { d0 - d1 }
|
||||
vldmia.f64 X, { d0 - d1 }
|
||||
vstmia.f64 Y, { d0 - d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d2 - d3 }
|
||||
fstmiad Y, { d2 - d3 }
|
||||
vldmia.f64 X, { d2 - d3 }
|
||||
vstmia.f64 Y, { d2 - d3 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d0 - d1 }
|
||||
fstmiad Y, { d0 - d1 }
|
||||
vldmia.f64 X, { d0 - d1 }
|
||||
vstmia.f64 Y, { d0 - d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d2 - d3 }
|
||||
fstmiad Y, { d2 - d3 }
|
||||
vldmia.f64 X, { d2 - d3 }
|
||||
vstmia.f64 Y, { d2 - d3 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
@@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro COPY_S1
|
||||
|
||||
fldmiad X, { d0 - d1 }
|
||||
fstmiad Y, { d0 - d1 }
|
||||
vldmia.f64 X, { d0 - d1 }
|
||||
vstmia.f64 Y, { d0 - d1 }
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
|
||||
@@ -76,15 +76,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
pld [ X, #X_PRE ]
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
fldmiad Y!, { d8 - d9 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vldmia.f64 Y!, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
fmacd d2 , d5, d9
|
||||
fmacd d3 , d5, d8
|
||||
|
||||
fldmiad Y!, { d10 - d11 }
|
||||
vldmia.f64 Y!, { d10 - d11 }
|
||||
fmacd d0 , d6, d10
|
||||
fmacd d1 , d6, d11
|
||||
pld [ X, #X_PRE ]
|
||||
@@ -93,15 +93,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
pld [ Y, #X_PRE ]
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
fldmiad Y!, { d8 - d9 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vldmia.f64 Y!, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fldmiad X!, { d6 - d7 }
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
fmacd d2 , d5, d9
|
||||
fmacd d3 , d5, d8
|
||||
|
||||
fldmiad Y!, { d10 - d11 }
|
||||
vldmia.f64 Y!, { d10 - d11 }
|
||||
fmacd d0 , d6, d10
|
||||
fmacd d1 , d6, d11
|
||||
fmacd d2 , d7, d11
|
||||
@@ -111,8 +111,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
fldmiad X!, { d4 - d5 }
|
||||
fldmiad Y!, { d8 - d9 }
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vldmia.f64 Y!, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fmacd d2 , d5, d9
|
||||
@@ -127,8 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
nop
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d8 - d9 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fmacd d2 , d5, d9
|
||||
@@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d8 - d9 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fmacd d2 , d5, d9
|
||||
@@ -145,8 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d8 - d9 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fmacd d2 , d5, d9
|
||||
@@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, INC_X
|
||||
add Y, Y, INC_Y
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d8 - d9 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fmacd d2 , d5, d9
|
||||
@@ -168,8 +168,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
fldmiad Y, { d8 - d9 }
|
||||
vldmia.f64 X, { d4 - d5 }
|
||||
vldmia.f64 Y, { d8 - d9 }
|
||||
fmacd d0 , d4, d8
|
||||
fmacd d1 , d4, d9
|
||||
fmacd d2 , d5, d9
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user