Compare commits
147 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
864e202afd | ||
|
|
9e4b6971e2 | ||
|
|
7f2a959e3e | ||
|
|
6418667818 | ||
|
|
dd43661cfd | ||
|
|
9253dadaa7 | ||
|
|
1e03a62b67 | ||
|
|
faa73690e4 | ||
|
|
f24d5307cf | ||
|
|
0a4276bc2f | ||
|
|
08bddde3f3 | ||
|
|
e173c51c04 | ||
|
|
9c42f0374a | ||
|
|
d4380c1fe4 | ||
|
|
a51102e9b7 | ||
|
|
7282419525 | ||
|
|
c5b1fbcb2e | ||
|
|
e1cdd15b30 | ||
|
|
d4c0330967 | ||
|
|
12540cedb5 | ||
|
|
99adc8b062 | ||
|
|
6a9bbfc227 | ||
|
|
3349e9debd | ||
|
|
dd7612358d | ||
|
|
e5a6ef3808 | ||
|
|
7aac0aff8e | ||
|
|
26d7f06206 | ||
|
|
68a69c5b50 | ||
|
|
a571359afd | ||
|
|
c2464a7c4a | ||
|
|
294f933869 | ||
|
|
f59c9bd6ef | ||
|
|
c53be46d78 | ||
|
|
bbb2d73d73 | ||
|
|
659ed16591 | ||
|
|
35c98a3556 | ||
|
|
f1a5dd06c5 | ||
|
|
e125a3dc33 | ||
|
|
35f1f21a7f | ||
|
|
7b4b7179ba | ||
|
|
7a92c1538e | ||
|
|
5727268141 | ||
|
|
3d9a50e841 | ||
|
|
828c849b44 | ||
|
|
ecc0bc9813 | ||
|
|
12f209b7b0 | ||
|
|
7316a87930 | ||
|
|
0bff057a87 | ||
|
|
7ee1d29dd4 | ||
|
|
1e6cf9808c | ||
|
|
278511ad2d | ||
|
|
3b5ffb49d3 | ||
|
|
8519e4ed9f | ||
|
|
55eda3813b | ||
|
|
53bfc83c26 | ||
|
|
13ca89f6f0 | ||
|
|
461cf9ea38 | ||
|
|
0664ba4c97 | ||
|
|
aa744dfa59 | ||
|
|
61cf8f74d9 | ||
|
|
de202fa375 | ||
|
|
6f93b53590 | ||
|
|
11c44dede1 | ||
|
|
f00d642592 | ||
|
|
9e4584d069 | ||
|
|
2a5679da5f | ||
|
|
a71e8c82f6 | ||
|
|
9b987badb0 | ||
|
|
1619b2f3c8 | ||
|
|
4f3153395a | ||
|
|
d7a1a7ff2a | ||
|
|
308e6195b7 | ||
|
|
7a3d7b1f52 | ||
|
|
74cc2d6623 | ||
|
|
fc3a558515 | ||
|
|
cd9fafc054 | ||
|
|
84b92e6373 | ||
|
|
c279a53ed8 | ||
|
|
e1df5a6e23 | ||
|
|
5c658f8746 | ||
|
|
ec4390a967 | ||
|
|
fced5744fb | ||
|
|
8c0fb1258d | ||
|
|
aae581d004 | ||
|
|
e17303933a | ||
|
|
f9226275f4 | ||
|
|
cf8c7e28b3 | ||
|
|
5ac02f6dc7 | ||
|
|
7aa1ad4923 | ||
|
|
dcd15b546c | ||
|
|
96284ab295 | ||
|
|
d5e1255ca7 | ||
|
|
587455868e | ||
|
|
323c237e7b | ||
|
|
faa5e2e5e3 | ||
|
|
551fdf53e8 | ||
|
|
fdf291be30 | ||
|
|
68eb4fa329 | ||
|
|
05196a8497 | ||
|
|
db9b611b12 | ||
|
|
2e6333f74e | ||
|
|
c99cc41cbd | ||
|
|
711ecb8bd5 | ||
|
|
10c2ebdfc5 | ||
|
|
26b3b3a3e6 | ||
|
|
acdff55a6a | ||
|
|
7d6b68eb4a | ||
|
|
0bbca5e803 | ||
|
|
cd5241d0cf | ||
|
|
8d652f11e7 | ||
|
|
6c86570e1f | ||
|
|
53ba1a77c8 | ||
|
|
d23c7c713c | ||
|
|
8c43d7fa5f | ||
|
|
8f758eeff9 | ||
|
|
8577be2a95 | ||
|
|
1edf30b790 | ||
|
|
4fc8c937d4 | ||
|
|
efa4f5c936 | ||
|
|
17d655fa64 | ||
|
|
f68141cf1d | ||
|
|
aa90518201 | ||
|
|
6b85dbb6dc | ||
|
|
a0debd4293 | ||
|
|
937493bfeb | ||
|
|
74b0672223 | ||
|
|
6e7be06e07 | ||
|
|
a04d0555ba | ||
|
|
3761c30ba4 | ||
|
|
38593cd3a3 | ||
|
|
e3b7781c2b | ||
|
|
5e6965ea47 | ||
|
|
5cc0301fc3 | ||
|
|
19a6dedfd6 | ||
|
|
0e2b92e216 | ||
|
|
d06b92906a | ||
|
|
8e98478ff3 | ||
|
|
fb8968fb83 | ||
|
|
dae6b82a71 | ||
|
|
d73244b825 | ||
|
|
233c6b959f | ||
|
|
16ec5323c9 | ||
|
|
0ad02ef2d6 | ||
|
|
73397faf68 | ||
|
|
5fc2203d8a | ||
|
|
78dcf5c3d5 | ||
|
|
32f793195f |
@@ -25,6 +25,7 @@ before_install:
|
||||
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
|
||||
|
||||
script:
|
||||
- set -e
|
||||
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
|
||||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 16.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 19.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
@@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH)
|
||||
list(APPEND BLASDIRS kernel)
|
||||
endif ()
|
||||
|
||||
if (DEFINED UTEST_CHECK)
|
||||
set(SANITY_CHECK 1)
|
||||
endif ()
|
||||
|
||||
if (DEFINED SANITY_CHECK)
|
||||
list(APPEND BLASDIRS reference)
|
||||
endif ()
|
||||
@@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED})
|
||||
message(FATAL_ERROR "Neither static nor shared are enabled.")
|
||||
endif ()
|
||||
|
||||
#Set default output directory
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
|
||||
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
@@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
|
||||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(utest)
|
||||
|
||||
if(NOT MSVC)
|
||||
#only build shared library for MSVC
|
||||
@@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
|
||||
endif()
|
||||
|
||||
#build test and ctest
|
||||
enable_testing()
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
|
||||
@@ -124,7 +124,13 @@ In chronological order:
|
||||
* Jerome Robert <jeromerobert@gmx.com>
|
||||
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
|
||||
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
|
||||
* [2015-12-28] Allow to force the number of parallel make job
|
||||
* [2015-12-28] Fix detection of AMD E2-3200
|
||||
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
|
||||
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
|
||||
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
|
||||
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
|
||||
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
|
||||
|
||||
* Dan Kortschak
|
||||
* [2015-01-07] Added test for drotmg bug #484.
|
||||
@@ -135,5 +141,16 @@ In chronological order:
|
||||
* Martin Koehler <https://github.com/grisuthedragon/>
|
||||
* [2015-09-07] Improved imatcopy
|
||||
|
||||
* [Your name or handle] <[email or website]>
|
||||
* [Date] [Brief summary of your changes]
|
||||
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
|
||||
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
|
||||
* [2015-11-20] lapack-test fixes for Cortex-A57
|
||||
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
|
||||
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
|
||||
|
||||
* theoractice <https://github.com/theoractice/>
|
||||
* [2016-03-20] Fix compiler error in VisualStudio with CMake
|
||||
* [2016-03-22] Fix access violation on Windows while static linking
|
||||
|
||||
* Abdelrauf <https://github.com/quickwritereader>
|
||||
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
|
||||
|
||||
|
||||
@@ -1,4 +1,81 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.18
|
||||
12-Apr-2016
|
||||
common:
|
||||
* If you set the MAKE_NB_JOBS flag to a value less than or equal to zero,
|
||||
make will be run without -j.
|
||||
|
||||
x86/x86_64:
|
||||
* Support building Visual Studio static library. (#813, Thanks, theoractice)
|
||||
* Fix bugs to pass buildbot CI tests (http://build.openblas.net)
|
||||
|
||||
ARM:
|
||||
* Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize S and C BLAS3 on Power8
|
||||
* Optimize BLAS2/1 on Power8
|
||||
|
||||
====================================================================
|
||||
Version 0.2.17
|
||||
20-Mar-2016
|
||||
common:
|
||||
* Enable BUILD_LAPACK_DEPRECATED=1 by default.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16
|
||||
15-Mar-2016
|
||||
common:
|
||||
* Avoid potential getenv segfault. (#716)
|
||||
* Import LAPACK svn bugfix #142-#147,#150-#155
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
|
||||
* Fix bug with scipy linalg test.
|
||||
|
||||
ARM:
|
||||
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize D and Z BLAS3 functions for Power8.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16.rc1
|
||||
23-Feb-2016
|
||||
common:
|
||||
* Upgrade LAPACK to 3.6.0 version.
|
||||
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
|
||||
LAPACK deprecated functions.
|
||||
* Add MAKE_NB_JOBS option in Makefile.
|
||||
Force number of make jobs. This is particularly
|
||||
useful when using distcc. (#735. Thanks, Jerome Robert.)
|
||||
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
|
||||
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
|
||||
* Improve small zger, zgemv, ztrmv using stack allocation (#727. Thanks, Jerome Robert)
|
||||
* Let openblas_get_num_threads return the number of active threads.
|
||||
(#760. Thanks, Jerome Robert)
|
||||
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
|
||||
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
|
||||
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
|
||||
* Detect Intel Avoton.
|
||||
* Detect AMD Trinity, Richland, E2-3200.
|
||||
* Fix gemv performance bug on Mac OSX Intel Haswell.
|
||||
* Fix some bugs with CMake and Visual Studio
|
||||
|
||||
ARM:
|
||||
* Support and optimize Cortex-A57 AArch64.
|
||||
(#686. Thanks, Ashwin Sekhar TK)
|
||||
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
|
||||
* Update ARMV6 kernels.
|
||||
|
||||
POWER:
|
||||
* Fix detection of POWER architecture
|
||||
(#684. Thanks, Sebastien Villemot)
|
||||
|
||||
====================================================================
|
||||
Version 0.2.15
|
||||
27-Oct-2015
|
||||
|
||||
15
Makefile
15
Makefile
@@ -83,20 +83,20 @@ shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
@@ -113,10 +113,8 @@ ifndef CROSS
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
ifdef UTEST_CHECK
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
endif
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
endif
|
||||
@@ -259,6 +257,9 @@ endif
|
||||
else
|
||||
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
|
||||
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.16.dev
|
||||
VERSION = 0.2.19.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -79,6 +79,9 @@ VERSION = 0.2.16.dev
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
@@ -109,7 +112,10 @@ NO_AFFINITY = 1
|
||||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
# Force number of make jobs. The default is the number of logical CPU of the host.
|
||||
# This is particularly useful when using distcc
|
||||
# This is particularly useful when using distcc.
|
||||
# A negative value will disable adding a -j flag to make, allowing to use a parent
|
||||
# make -j value. This is useful to call OpenBLAS make from another project
|
||||
# makefile
|
||||
# MAKE_NB_JOBS = 2
|
||||
|
||||
# If you would like to know minute performance report of GotoBLAS.
|
||||
@@ -142,10 +148,6 @@ NO_AFFINITY = 1
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
|
||||
# SANITY_CHECK to compare the result with reference BLAS.
|
||||
# UTEST_CHECK = 1
|
||||
|
||||
# The installation directory.
|
||||
# PREFIX = /opt/OpenBLAS
|
||||
|
||||
|
||||
6
Makefile.zarch
Normal file
6
Makefile.zarch
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
CCOMMON_OPT += -march=z13 -mzvector
|
||||
FCOMMON_OPT += -march=z13 -mzvector
|
||||
endif
|
||||
|
||||
@@ -75,10 +75,15 @@ Please read GotoBLAS_01Readme.txt
|
||||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **Z13**: blas3 for double
|
||||
|
||||
|
||||
### Support OS:
|
||||
- **GNU/Linux**
|
||||
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
|
||||
|
||||
199
USAGE.md
Normal file
199
USAGE.md
Normal file
@@ -0,0 +1,199 @@
|
||||
# Notes on OpenBLAS usage
|
||||
## Usage
|
||||
|
||||
#### Program is Terminated. Because you tried to allocate too many memory regions
|
||||
|
||||
In OpenBLAS, we manage a pool of memory buffers and allocate the number of
|
||||
buffers as the following.
|
||||
```
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
```
|
||||
This error indicates that the program exceeded the number of buffers.
|
||||
|
||||
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
|
||||
following ways:
|
||||
|
||||
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
|
||||
* Call `openblas_set_num_threads(1)` in the application on runtime.
|
||||
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
|
||||
|
||||
If the application is parallelized by OpenMP, please use OpenBLAS built with
|
||||
`USE_OPENMP=1`
|
||||
|
||||
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
|
||||
|
||||
The environment variable that controls the kernel selection is
|
||||
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
|
||||
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
|
||||
returns the used target.
|
||||
|
||||
#### How could I disable OpenBLAS threading affinity on runtime?
|
||||
|
||||
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
|
||||
variable to disable threading affinity on runtime. For example, before the
|
||||
running,
|
||||
```
|
||||
export OPENBLAS_MAIN_FREE=1
|
||||
```
|
||||
|
||||
Alternatively, you can disable the affinity feature by enabling `NO_AFFINITY=1`
|
||||
in `Makefile.rule`.
|
||||
|
||||
## Linking with the library
|
||||
|
||||
* Link with shared library
|
||||
|
||||
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
|
||||
|
||||
If the library is multithreaded, please add `-lpthread`. If the library
|
||||
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
|
||||
|
||||
* Link with static library
|
||||
|
||||
`gcc -o test test.c /your/path/libopenblas.a`
|
||||
|
||||
You can download `test.c` from https://gist.github.com/xianyi/5780018
|
||||
|
||||
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
|
||||
default), custom programs statically linked against `libopenblas.a` should also
|
||||
link with the pthread library e.g.:
|
||||
|
||||
```
|
||||
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
|
||||
```
|
||||
|
||||
Failing to add the `-lpthread` flag will cause errors such as:
|
||||
|
||||
```
|
||||
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
|
||||
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
|
||||
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
|
||||
...
|
||||
```
|
||||
|
||||
## Code examples
|
||||
|
||||
#### Call CBLAS interface
|
||||
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
|
||||
```
|
||||
#include <cblas.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void main()
|
||||
{
|
||||
int i=0;
|
||||
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
|
||||
|
||||
for(i=0; i<9; i++)
|
||||
printf("%lf ", C[i]);
|
||||
printf("\n");
|
||||
}
|
||||
```
|
||||
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
|
||||
|
||||
#### Call BLAS Fortran interface
|
||||
|
||||
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
|
||||
|
||||
```
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "sys/time.h"
|
||||
#include "time.h"
|
||||
|
||||
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int i;
|
||||
printf("test!\n");
|
||||
if(argc<4){
|
||||
printf("Input Error\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
int sizeofa = m * k;
|
||||
int sizeofb = k * n;
|
||||
int sizeofc = m * n;
|
||||
char ta = 'N';
|
||||
char tb = 'N';
|
||||
double alpha = 1.2;
|
||||
double beta = 0.001;
|
||||
|
||||
struct timeval start,finish;
|
||||
double duration;
|
||||
|
||||
double* A = (double*)malloc(sizeof(double) * sizeofa);
|
||||
double* B = (double*)malloc(sizeof(double) * sizeofb);
|
||||
double* C = (double*)malloc(sizeof(double) * sizeofc);
|
||||
|
||||
srand((unsigned)time(NULL));
|
||||
|
||||
for (i=0; i<sizeofa; i++)
|
||||
A[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofb; i++)
|
||||
B[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofc; i++)
|
||||
C[i] = i%3+1;//(rand()%100)/10.0;
|
||||
//#if 0
|
||||
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
|
||||
gettimeofday(&start, NULL);
|
||||
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
|
||||
gettimeofday(&finish, NULL);
|
||||
|
||||
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
|
||||
double gflops = 2.0 * m *n*k;
|
||||
gflops = gflops/duration*1.0e-6;
|
||||
|
||||
FILE *fp;
|
||||
fp = fopen("timeDGEMM.txt", "a");
|
||||
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
|
||||
fclose(fp);
|
||||
|
||||
free(A);
|
||||
free(B);
|
||||
free(C);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
|
||||
|
||||
` ./time_dgemm <m> <n> <k> `
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
|
||||
* The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64 (amd64), there is experimental support for up to 1024 CPUs/Cores and 128 NUMA nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line NO_AFFINITY=1 in Makefile.rule. But this may cause [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A, `make test` may fail because of a pthread_create error. The error code is EAGAIN. However, the same test case runs fine when you execute it directly from the shell.
|
||||
|
||||
## BLAS reference manual
|
||||
If you want to understand every BLAS function and definition, please read
|
||||
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
|
||||
or [netlib.org](http://netlib.org/blas/)
|
||||
|
||||
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
|
||||
|
||||
## How to reference OpenBLAS.
|
||||
|
||||
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
|
||||
|
||||
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
version: 0.2.15.{build}
|
||||
version: 0.2.19.{build}
|
||||
|
||||
#environment:
|
||||
|
||||
@@ -39,4 +39,6 @@ before_build:
|
||||
- cmake -G "Visual Studio 12 Win64" .
|
||||
|
||||
test_script:
|
||||
- echo Build OK!
|
||||
- echo Running Test
|
||||
- cd c:\projects\OpenBLAS\utest
|
||||
- openblas_utest
|
||||
|
||||
@@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
|
||||
# Apple vecLib
|
||||
LIBVECLIB = -framework Accelerate
|
||||
|
||||
ESSL=/opt/ibm/lib
|
||||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
@@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
sdot.goto ddot.goto \
|
||||
srot.goto drot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
|
||||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
@@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
sdot.goto ddot.goto cdot.goto zdot.goto \
|
||||
srot.goto drot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
|
||||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
@@ -166,7 +172,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
@@ -252,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||
|
||||
endif
|
||||
|
||||
|
||||
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
|
||||
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
|
||||
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
|
||||
|
||||
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
|
||||
@@ -305,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
|
||||
slinpack.veclib : slinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
slinpack.essl : slinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dlinpack ####################################################
|
||||
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -321,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
|
||||
dlinpack.veclib : dlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dlinpack.essl : dlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Clinpack ####################################################
|
||||
|
||||
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -338,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
|
||||
clinpack.veclib : clinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
clinpack.essl : clinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zlinpack ####################################################
|
||||
|
||||
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -355,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
|
||||
zlinpack.veclib : zlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zlinpack.essl : zlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Scholesky ###################################################
|
||||
|
||||
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -440,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX)
|
||||
sgemm.veclib : sgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
sgemm.essl : sgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dgemm ####################################################
|
||||
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -456,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX)
|
||||
dgemm.veclib : dgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dgemm.essl : dgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Cgemm ####################################################
|
||||
|
||||
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -473,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX)
|
||||
cgemm.veclib : cgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
cgemm.essl : cgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zgemm ####################################################
|
||||
|
||||
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -490,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX)
|
||||
zgemm.veclib : zgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zgemm.essl : zgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ssymm ####################################################
|
||||
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -572,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX)
|
||||
strmm.veclib : strmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
strmm.essl : strmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dtrmm ####################################################
|
||||
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -588,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX)
|
||||
dtrmm.veclib : dtrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dtrmm.essl : dtrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ctrmm ####################################################
|
||||
|
||||
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -605,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX)
|
||||
ctrmm.veclib : ctrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ctrmm.essl : ctrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ztrmm ####################################################
|
||||
|
||||
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
@@ -622,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX)
|
||||
ztrmm.veclib : ztrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ztrmm.essl : ztrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Strsm ####################################################
|
||||
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -1412,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX)
|
||||
zdot.veclib : zdot-intel.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Srot ####################################################
|
||||
srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
srot.acml : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.atlas : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.mkl : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.veclib : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Drot ####################################################
|
||||
drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
drot.acml : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.atlas : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.mkl : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.veclib : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
##################################### Saxpy ####################################################
|
||||
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
@@ -2123,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c
|
||||
zgesv.$(SUFFIX) : gesv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
srot.$(SUFFIX) : rot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
drot.$(SUFFIX) : rot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -2132,9 +2217,11 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
||||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
|
||||
|
||||
197
benchmark/rot.c
Normal file
197
benchmark/rot.c
Normal file
@@ -0,0 +1,197 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef DOT
|
||||
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define ROT BLASFUNC(drot)
|
||||
#else
|
||||
#define ROT BLASFUNC(srot)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * malloc() stand-in that obtains the buffer from a System V shared-memory
 * segment requested with SHM_HUGETLB.
 *
 *   size  number of bytes wanted; the request is padded with
 *         (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1)
 *
 * Returns the attached address.  Prints a message and exits with status 1
 * if either shmget() or shmat() fails.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  BLASLONG padded_size = (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1);
  int      segment     = shmget(IPC_PRIVATE, padded_size,
                                SHM_HUGETLB | IPC_CREAT | 0600);
  void    *mapping;

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapping = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapping == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* flag the segment for removal right away; the attachment stays usable */
  shmctl(segment, IPC_RMID, 0);

  return mapping;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
// FLOAT result;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
FLOAT c[1] = { 2.0 };
|
||||
FLOAT s[1] = { 2.0 };
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
ROT (&m, x, &inc_x, y, &inc_y, c, s);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
196
benchmark/smallscaling.c
Normal file
196
benchmark/smallscaling.c
Normal file
@@ -0,0 +1,196 @@
|
||||
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <omp.h>
#ifndef _WIN32
// the pthread comparison is only compiled when _WIN32 is not defined
#include <pthread.h>
#endif

// smallest and largest matrix dimension that is benchmarked
#define MIN_SIZE 5
#define MAX_SIZE 60
// number of geometric growth steps between MIN_SIZE and MAX_SIZE
#define NB_SIZE 10

// Number of loop iterations for a 1x1 matrix. Lower it if the test is
// too slow on your computer.
#define NLOOP 2e7
|
||||
|
||||
/*
 * Description of one benchmark run, handed to the bench drivers
 * (trmv_bench, gemv_bench, ger_bench) by the timing harnesses.
 */
typedef struct {
|
||||
int matrix_size; /* matrix dimension; drivers allocate size*size and size elements */
|
||||
int n_loop; /* iteration budget; each driver divides it by matrix_size */
|
||||
void (* bench_func)(); /* driver to run; it is called with a pointer to this struct */
|
||||
void (* blas_func)(); /* BLAS routine under test; unprototyped, so each driver passes its own argument list */
|
||||
void * (* create_matrix)(int size); /* allocates and fills a buffer of `size` elements */
|
||||
} BenchParam;
|
||||
|
||||
/*
 * Allocate `size` single-precision values and fill them with the ramp
 * r[i] = 1e3 * i / size.
 *
 * Returns a malloc()ed buffer that the caller releases with free().
 * Prints a message and exits if the allocation fails.
 */
void * s_create_matrix(int size) {
  /* size the buffer from the element type actually stored (float) */
  float * r = malloc(size * sizeof(*r));
  int i;

  /* malloc(0) may legitimately return NULL, so only a non-empty request counts as failure */
  if(r == NULL && size > 0) {
    fprintf(stderr, "s_create_matrix: out of memory\n");
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Allocate `size` single-precision complex values (2 * size floats) and
 * fill the floats with the ramp r[i] = 1e3 * i / size.
 *
 * Returns a malloc()ed buffer that the caller releases with free().
 * Prints a message and exits if the allocation fails.
 */
void * c_create_matrix(int size) {
  /* two floats per element, sized from the stored type */
  float * r = malloc(size * 2 * sizeof(*r));
  int i;

  /* malloc(0) may legitimately return NULL, so only a non-empty request counts as failure */
  if(r == NULL && size > 0) {
    fprintf(stderr, "c_create_matrix: out of memory\n");
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < 2 * size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Allocate `size` double-precision complex values (2 * size doubles) and
 * fill the doubles with the ramp r[i] = 1e3 * i / size.
 *
 * Returns a malloc()ed buffer that the caller releases with free().
 * Prints a message and exits if the allocation fails.
 */
void * z_create_matrix(int size) {
  double * r = malloc(size * 2 * sizeof(*r));
  int i;

  /* malloc(0) may legitimately return NULL, so only a non-empty request counts as failure */
  if(r == NULL && size > 0) {
    fprintf(stderr, "z_create_matrix: out of memory\n");
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < 2 * size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Allocate `size` double-precision values and fill them with the ramp
 * r[i] = 1e3 * i / size.
 *
 * Returns a malloc()ed buffer that the caller releases with free().
 * Prints a message and exits if the allocation fails.
 */
void * d_create_matrix(int size) {
  double * r = malloc(size * sizeof(*r));
  int i;

  /* malloc(0) may legitimately return NULL, so only a non-empty request counts as failure */
  if(r == NULL && size > 0) {
    fprintf(stderr, "d_create_matrix: out of memory\n");
    exit(EXIT_FAILURE);
  }
  for(i = 0; i < size; i++)
    r[i] = 1e3 * i / size;
  return r;
}
|
||||
|
||||
/*
 * Benchmark driver for a TRMV-style routine.
 *
 * Builds a dim x dim matrix and a dim-element vector with
 * param->create_matrix, then calls param->blas_func
 * (param->n_loop / dim) times with the arguments
 * "U", "N", "N", n, A, lda, x, incx (lda = n, incx = 1).
 * Both buffers are freed before returning.
 */
void trmv_bench(BenchParam * param)
{
  int dim = param->matrix_size;
  int iterations = param->n_loop / dim;
  int unit_stride = 1;
  int done;
  void * matrix = param->create_matrix(dim * dim);
  void * vec = param->create_matrix(dim);

  for(done = 0; done < iterations; ++done) {
    param->blas_func("U", "N", "N", &dim, matrix, &dim, vec, &unit_stride);
  }

  free(vec);
  free(matrix);
}
|
||||
|
||||
/*
 * Benchmark driver for a GEMV-style routine.
 *
 * Builds a dim x dim matrix and a dim-element vector with
 * param->create_matrix, then calls param->blas_func
 * (param->n_loop / dim) times.  The same scalar (1.01) is passed in both
 * scalar positions and the same vector in both vector positions, each with
 * stride 1.  Both buffers are freed before returning.
 */
void gemv_bench(BenchParam * param)
{
  int dim = param->matrix_size;
  int iterations = param->n_loop / dim;
  double scale = 1.01;
  int unit_stride = 1;
  int done;
  void * matrix = param->create_matrix(dim * dim);
  void * vec = param->create_matrix(dim);

  for(done = 0; done < iterations; ++done) {
    param->blas_func("N", &dim, &dim, &scale, matrix, &dim,
                     vec, &unit_stride, &scale, vec, &unit_stride);
  }

  free(vec);
  free(matrix);
}
|
||||
|
||||
/*
 * Benchmark driver for a GER-style rank-1 update routine.
 *
 * Builds a dim x dim matrix and a dim-element vector with
 * param->create_matrix, then calls param->blas_func
 * (param->n_loop / dim) times, passing the scalar 1.01 and the same
 * vector (stride 1) in both vector positions.  Both buffers are freed
 * before returning.
 */
void ger_bench(BenchParam * param) {
  int dim = param->matrix_size;
  int iterations = param->n_loop / dim;
  double scale = 1.01;
  int unit_stride = 1;
  int done;
  void * matrix = param->create_matrix(dim * dim);
  void * vec = param->create_matrix(dim);

  for(done = 0; done < iterations; ++done) {
    param->blas_func(&dim, &dim, &scale, vec, &unit_stride,
                     vec, &unit_stride, matrix, &dim);
  }

  free(vec);
  free(matrix);
}
|
||||
|
||||
#ifndef _WIN32
|
||||
void * pthread_func_wrapper(void * param) {
|
||||
((BenchParam *)param)->bench_func(param);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Benchmark table: NB_TESTS rows of four entries each, read by main() as
 *   [4*i + 0] bench driver, [4*i + 1] BLAS routine,
 *   [4*i + 2] matrix constructor, [4*i + 3] name printed in the report.
 */
#define NB_TESTS 5
|
||||
void * TESTS[4 * NB_TESTS] = {
|
||||
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
|
||||
gemv_bench, dgemv_, d_create_matrix, "dgemv",
|
||||
gemv_bench, zgemv_, z_create_matrix, "zgemv",
|
||||
ger_bench, dger_, d_create_matrix, "dger",
|
||||
ger_bench, zgerc_, z_create_matrix, "zgerc",
|
||||
};
|
||||
|
||||
inline static double delta_time(struct timespec tick) {
|
||||
struct timespec tock;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tock);
|
||||
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
|
||||
}
|
||||
|
||||
/*
 * Time a benchmark spread over `nb_threads` POSIX threads.
 *
 * The iteration budget of *param is divided by nb_threads in a private
 * copy, one thread per slot is started on pthread_func_wrapper with that
 * copy, and all threads are joined.  Returns the elapsed seconds from
 * before the first pthread_create() until after the last join.  A failing
 * pthread_create() prints its return code and exits with status -1.
 *
 * When _WIN32 is defined nothing is run and 0 is returned.
 */
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifndef _WIN32
  BenchParam per_thread = *param;
  pthread_t workers[nb_threads];
  struct timespec started;
  int idx;

  per_thread.n_loop /= nb_threads;
  clock_gettime(CLOCK_MONOTONIC, &started);

  for(idx = 0; idx < nb_threads; idx++) {
    int status = pthread_create(&workers[idx], NULL, pthread_func_wrapper, &per_thread);
    if (status != 0) {
      printf("ERROR; return code from pthread_create() is %d\n", status);
      exit(-1);
    }
  }

  for(idx = 0; idx < nb_threads; idx++) {
    pthread_join(workers[idx], NULL);
  }

  return delta_time(started);
#else
  return 0;
#endif
}
|
||||
|
||||
/*
 * Time a single, sequential invocation of param->bench_func(param).
 * Returns the elapsed seconds measured with delta_time().
 */
double seq_bench(BenchParam * param) {
  struct timespec started;

  clock_gettime(CLOCK_MONOTONIC, &started);
  (*param->bench_func)(param);

  return delta_time(started);
}
|
||||
|
||||
/*
 * Time a benchmark spread over an OpenMP parallel loop.
 *
 * The iteration budget of *param is divided by omp_get_max_threads() in a
 * private copy, and the driver is invoked once per loop iteration of an
 * `omp parallel for` that has as many iterations as that thread count.
 * Returns the elapsed seconds around the whole loop.
 */
double omp_bench(BenchParam * param) {
  int team_size = omp_get_max_threads();
  BenchParam per_thread = *param;
  struct timespec started;
  int worker;

  per_thread.n_loop /= team_size;
  clock_gettime(CLOCK_MONOTONIC, &started);

#pragma omp parallel for
  for(worker = 0; worker < team_size; worker++) {
    per_thread.bench_func(&per_thread);
  }

  return delta_time(started);
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
|
||||
BenchParam param;
|
||||
int test_id;
|
||||
printf ("Running on %d threads\n", omp_get_max_threads());
|
||||
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
|
||||
double size = MIN_SIZE;
|
||||
param.bench_func = TESTS[test_id * 4];
|
||||
param.blas_func = TESTS[test_id * 4 + 1];
|
||||
param.create_matrix = TESTS[test_id * 4 + 2];
|
||||
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
|
||||
param.n_loop = NLOOP;
|
||||
while(size <= MAX_SIZE) {
|
||||
param.matrix_size = (int)(size + 0.5);
|
||||
double seq_time = seq_bench(¶m);
|
||||
double omp_time = omp_bench(¶m);
|
||||
double pthread_time = pthread_bench(¶m, omp_get_max_threads());
|
||||
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
|
||||
"pthread %gs, speedup %g\n",
|
||||
param.matrix_size, seq_time,
|
||||
omp_time, seq_time / omp_time,
|
||||
pthread_time, seq_time / pthread_time);
|
||||
size *= inc_factor;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
8
c_check
8
c_check
@@ -7,6 +7,7 @@ $hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
@@ -70,6 +71,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
@@ -89,6 +91,11 @@ if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "zarch") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
@@ -162,6 +169,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
@@ -2038,6 +2038,59 @@ set(MATGEN
|
||||
lapacke_zlagsy_work.c
|
||||
)
|
||||
|
||||
set(Utils_SRC
|
||||
lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c
|
||||
lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c
|
||||
lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c
|
||||
lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c
|
||||
lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c
|
||||
lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c
|
||||
lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c
|
||||
lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c
|
||||
lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c
|
||||
lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c
|
||||
lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c
|
||||
lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c
|
||||
lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c
|
||||
lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c
|
||||
lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c
|
||||
lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c
|
||||
lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c
|
||||
lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c
|
||||
lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c
|
||||
lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c
|
||||
lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c
|
||||
lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c
|
||||
lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c
|
||||
lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c
|
||||
lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c
|
||||
lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c
|
||||
lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c
|
||||
lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c
|
||||
lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c
|
||||
lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c
|
||||
lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c
|
||||
lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c
|
||||
lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c
|
||||
lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c
|
||||
lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c
|
||||
lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c
|
||||
lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c
|
||||
lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c
|
||||
lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c
|
||||
lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c
|
||||
lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c
|
||||
lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c
|
||||
lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c
|
||||
lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c
|
||||
lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c
|
||||
lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c
|
||||
lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c
|
||||
lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c
|
||||
lapacke_dpb_nancheck.c lapacke_sst_nancheck.c
|
||||
lapacke_dpb_trans.c lapacke_ssy_nancheck.c
|
||||
)
|
||||
|
||||
set(LAPACKE_REL_SRC "")
|
||||
if (BUILD_SINGLE)
|
||||
list(APPEND LAPACKE_REL_SRC ${SSRC})
|
||||
@@ -2061,6 +2114,10 @@ foreach (LAE_FILE ${LAPACKE_REL_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}")
|
||||
endforeach ()
|
||||
|
||||
# Add every file named in Utils_SRC to LAPACKE_SOURCES, prefixed with the
# LAPACKE/utils directory under NETLIB_LAPACK_DIR.
foreach (utils_source IN LISTS Utils_SRC)
  list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${utils_source}")
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
|
||||
include_directories(${lapacke_include_dir})
|
||||
|
||||
7
common.h
7
common.h
@@ -332,12 +332,13 @@ typedef int blasint;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
@@ -408,6 +409,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_arm64.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_ZARCH
|
||||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWS
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
||||
@@ -70,7 +70,7 @@ extern long int syscall (long int __sysno, ...);
|
||||
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
unsigned long *nodemask, unsigned long maxnode,
|
||||
unsigned flags) {
|
||||
#if defined (__LSB_VERSION__)
|
||||
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
|
||||
// So far, LSB (Linux Standard Base) don't support syscall().
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
@@ -90,7 +90,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
#if defined (__LSB_VERSION__)
|
||||
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
|
||||
// So far, LSB (Linux Standard Base) don't support syscall().
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
|
||||
@@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
||||
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
|
||||
stack_alloc_size = 0; \
|
||||
STACK_ALLOC_PROTECT_SET \
|
||||
TYPE stack_buffer[stack_alloc_size]; \
|
||||
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
|
||||
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
|
||||
@@ -62,7 +62,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
// use intrinsic instead of inline assembly
|
||||
ret = _InterlockedExchange(address, 1);
|
||||
ret = _InterlockedExchange((volatile LONG *)address, 1);
|
||||
// inline assembly
|
||||
/*__asm {
|
||||
mov eax, address
|
||||
|
||||
@@ -396,7 +396,7 @@ REALNAME:
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#define EPILOGUE .end REALNAME
|
||||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
|
||||
140
common_zarch.h
Normal file
140
common_zarch.h
Normal file
@@ -0,0 +1,140 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_ZARCH
|
||||
#define COMMON_ZARCH
|
||||
|
||||
#define MB
|
||||
//__asm__ __volatile__ ("dmb ish" : : : "memory")
|
||||
#define WMB
|
||||
//__asm__ __volatile__ ("dmb ishst" : : : "memory")
|
||||
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/*
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"mov x4, #1 \n\t"
|
||||
"1: \n\t"
|
||||
"ldaxr x2, [%1] \n\t"
|
||||
"cbnz x2, 1b \n\t"
|
||||
"2: \n\t"
|
||||
"stxr w3, x4, [%1] \n\t"
|
||||
"cbnz w3, 1b \n\t"
|
||||
"mov %0, #0 \n\t"
|
||||
: "=r"(ret), "=r"(address)
|
||||
: "1"(address)
|
||||
: "memory", "x2" , "x3", "x4"
|
||||
|
||||
|
||||
);
|
||||
|
||||
|
||||
} while (ret);
|
||||
|
||||
}
|
||||
*/
|
||||
//#define BLAS_LOCK_DEFINED
|
||||
|
||||
|
||||
|
||||
/* Integer quotient x / y, computed with the plain division operator. */
static inline int blas_quickdivide(blasint x, blasint y){
  const blasint quotient = x / y;

  return quotient;
}
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
#define GET_IMAGE_CANCEL
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 256 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
REALNAME:
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#ifndef PAGESIZE
|
||||
#define PAGESIZE ( 4 << 10)
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#if defined(CORTEXA57)
|
||||
#define BUFFER_SIZE (20 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
#endif
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -191,6 +191,8 @@ void get_cpuconfig(void)
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
93
cpuid_zarch.c
Normal file
93
cpuid_zarch.c
Normal file
@@ -0,0 +1,93 @@
|
||||
/**************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* printf() is used by the functions in this file, so declare it here
   rather than relying on whoever includes or compiles this file. */
#include <stdio.h>
#include <string.h>

/* Core identifiers; each value is also the index of the matching entry
   in cpuname[] and cpuname_lower[]. */
#define CPU_GENERIC 0
#define CPU_Z13 1

/* Upper-case core names, indexed by the CPU_* identifiers. */
static char *cpuname[] = {
  "ZARCH_GENERIC",
  "Z13"
};

/* Lower-case core names, indexed by the CPU_* identifiers. */
static char *cpuname_lower[] = {
  "zarch_generic",
  "z13"
};
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
// return CPU_GENERIC;
|
||||
return CPU_Z13;
|
||||
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
||||
char *get_corename(void)
|
||||
{
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Print the architecture name, "ZARCH", to stdout (no newline). */
void get_architecture(void)
|
||||
{
|
||||
printf("ZARCH");
|
||||
}
|
||||
|
||||
void get_subarchitecture(void)
|
||||
{
|
||||
int d = detect();
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
/* Print the sub-directory name, "zarch", to stdout (no newline). */
void get_subdirname(void)
|
||||
{
|
||||
printf("zarch");
|
||||
}
|
||||
|
||||
|
||||
/* Prints C preprocessor "#define" lines for the core reported by detect():
   one line naming the core, then DTB_DEFAULT_ENTRIES (64 for both handled cores).
   The switch has no default case, so any other value prints nothing. */
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
switch (d){
|
||||
case CPU_GENERIC:
|
||||
printf("#define ZARCH_GENERIC\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
case CPU_Z13:
|
||||
printf("#define Z13\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
4
ctest.c
4
ctest.c
@@ -105,6 +105,10 @@ ARCH_X86_64
|
||||
ARCH_POWER
|
||||
#endif
|
||||
|
||||
#if defined(__s390x__) || defined(__zarch__)
|
||||
ARCH_ZARCH
|
||||
#endif
|
||||
|
||||
#ifdef __mips64
|
||||
ARCH_MIPS64
|
||||
#endif
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||
#endif
|
||||
|
||||
x = buffer;
|
||||
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
|
||||
buffer += ((COMPSIZE * args -> m + 3) & ~3);
|
||||
}
|
||||
|
||||
#ifndef TRANS
|
||||
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||
|
||||
if (num_cpu) {
|
||||
queue[0].sa = NULL;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
|
||||
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -99,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
# for gemm3m
|
||||
# When USE_GEMM3M is set, register one gemm3m object per entry of GEMM_DEFINES
# for the current ${float_type}. The object name suffix is the lower-cased define.
# NOTE(review): GenerateNamedObjects is a project helper not visible here;
# the meaning of its positional false/"" arguments should be confirmed there.
if(USE_GEMM3M)
|
||||
foreach (GEMM_DEFINE ${GEMM_DEFINES})
|
||||
string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
# Additional THREADED_LEVEL3 variant, only for SMP builds that do not
# request USE_SIMPLE_THREADED_LEVEL3.
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
endif ()
|
||||
endforeach ()
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ set(COMMON_SOURCES
|
||||
xerbla.c
|
||||
openblas_set_num_threads.c
|
||||
openblas_error_handle.c
|
||||
openblas_env.c
|
||||
openblas_get_num_procs.c
|
||||
openblas_get_num_threads.c
|
||||
)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
TOPDIR = ../..
|
||||
include ../../Makefile.system
|
||||
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
|
||||
|
||||
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||
|
||||
@@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
|
||||
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_env.$(SUFFIX) : openblas_env.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
||||
@@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
#endif
|
||||
|
||||
extern unsigned int openblas_thread_timeout();
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
|
||||
#undef MONITOR
|
||||
@@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
|
||||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
int thread_timeout_env;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
@@ -540,22 +543,12 @@ int blas_thread_init(void){
|
||||
|
||||
if (!blas_server_avail){
|
||||
|
||||
env_var_t p;
|
||||
|
||||
if (readenv(p,"THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}else{
|
||||
if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}
|
||||
}
|
||||
|
||||
thread_timeout_env=openblas_thread_timeout();
|
||||
if (thread_timeout_env>0) {
|
||||
if (thread_timeout_env < 4) thread_timeout_env = 4;
|
||||
if (thread_timeout_env > 30) thread_timeout_env = 30;
|
||||
thread_timeout = (1 << thread_timeout_env);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
|
||||
|
||||
@@ -391,7 +391,7 @@ static char *corename[] = {
|
||||
"Nehalem",
|
||||
"Athlon",
|
||||
"Opteron",
|
||||
"Opteron(SSE3)",
|
||||
"Opteron_SSE3",
|
||||
"Barcelona",
|
||||
"Nano",
|
||||
"Sandybridge",
|
||||
|
||||
@@ -294,8 +294,11 @@ void openblas_fork_handler()
|
||||
#endif
|
||||
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
env_var_t p;
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
@@ -310,18 +313,18 @@ int blas_get_cpu_number(void){
|
||||
|
||||
blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
@@ -1340,6 +1343,7 @@ static void gotoblas_memory_init(void) {
|
||||
/* Initialization for all function; this function should be called before main */
|
||||
|
||||
static int gotoblas_initialized = 0;
|
||||
extern void openblas_read_env();
|
||||
|
||||
void CONSTRUCTOR gotoblas_init(void) {
|
||||
|
||||
@@ -1349,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||
openblas_fork_handler();
|
||||
#endif
|
||||
|
||||
openblas_read_env();
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
@@ -1365,7 +1371,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||
gotoblas_memory_init();
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
//#if defined(OS_LINUX)
|
||||
#if 0
|
||||
struct rlimit curlimit;
|
||||
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
|
||||
{
|
||||
@@ -1445,6 +1452,31 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
This is to allow static linking.
|
||||
Code adapted from Google performance tools:
|
||||
https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
|
||||
Reference:
|
||||
https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
|
||||
http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
|
||||
*/
|
||||
/* Termination hook: calls gotoblas_quit() and always returns 0.
   NOTE(review): a pointer to this function is stored in the ".CRT$XTU" data
   segment a few lines below, presumably so the CRT invokes it at process
   exit in static builds -- confirm against the references cited above. */
static int on_process_term(void)
|
||||
{
|
||||
gotoblas_quit();
|
||||
return 0;
|
||||
}
|
||||
#ifdef _WIN64
|
||||
#pragma comment(linker, "/INCLUDE:_tls_used")
|
||||
#else
|
||||
#pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
#endif
|
||||
#pragma data_seg(push, old_seg)
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg(pop, old_seg)
|
||||
#endif
|
||||
|
||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||
|
||||
84
driver/others/openblas_env.c
Normal file
84
driver/others/openblas_env.c
Normal file
@@ -0,0 +1,84 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Cached copies of the environment settings. All start at 0 and are
   overwritten by openblas_read_env(); the accessors below only return them. */
static int openblas_env_verbose=0;
|
||||
static unsigned int openblas_env_thread_timeout=0;
|
||||
static int openblas_env_block_factor=0;
|
||||
static int openblas_env_openblas_num_threads=0;
|
||||
static int openblas_env_goto_num_threads=0;
|
||||
static int openblas_env_omp_num_threads=0;
|
||||
|
||||
/* Read-only accessors: each returns the corresponding cached value and does
   not touch the environment itself. A value of 0 is what the cache holds when
   the variable was unset or clamped by openblas_read_env(). */
int openblas_verbose() { return openblas_env_verbose;}
|
||||
unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
|
||||
int openblas_block_factor() { return openblas_env_block_factor;}
|
||||
int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
|
||||
int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
|
||||
int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
|
||||
|
||||
/* Reads six environment variables and stores each result in the matching
   openblas_env_* static. For every variable the same steps are repeated:
   reset ret to 0, parse the value with atoi() if readenv() reports success,
   clamp negative results to 0, then store. A variable that is not read
   therefore leaves 0 in the cache.
   NOTE(review): readenv() and env_var_t come from common.h and are not
   visible here; this assumes readenv() returns non-zero when the variable
   is set -- confirm.
   NOTE(review): other hunks of this diff show callers that used
   readenv(p,"THREAD_TIMEOUT"), readenv(p,"GOTO_THREAD_TIMEOUT") and
   readenv(p,"GOTO_BLOCK_FACTOR"); the names read below are
   OPENBLAS_THREAD_TIMEOUT and OPENBLAS_BLOCK_FACTOR. Confirm the rename is
   intended, since the earlier names are not consulted here. */
void openblas_read_env() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_verbose=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_block_factor=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
/* ret is non-negative here, so the cast to unsigned keeps its value. */
openblas_env_thread_timeout=(unsigned int)ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_openblas_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_goto_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_omp_num_threads=ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int openblas_verbose() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
return ret;
|
||||
}
|
||||
extern int openblas_verbose();
|
||||
|
||||
void openblas_warning(int verbose, const char * msg) {
|
||||
int current_verbose;
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include <string.h>
|
||||
#include "common.h"
|
||||
|
||||
extern int openblas_block_factor();
|
||||
int get_L2_size(void);
|
||||
|
||||
#define DEFAULT_GEMM_P 128
|
||||
@@ -249,7 +250,6 @@ int get_L2_size(void){
|
||||
|
||||
void blas_set_parameter(void){
|
||||
|
||||
env_var_t p;
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
int size = 16;
|
||||
@@ -468,9 +468,8 @@ void blas_set_parameter(void){
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
if (readenv(p,"GOTO_BLOCK_FACTOR")) {
|
||||
factor = atoi(p);
|
||||
factor=openblas_block_factor();
|
||||
if (factor>0) {
|
||||
if (factor < 10) factor = 10;
|
||||
if (factor > 200) factor = 200;
|
||||
|
||||
|
||||
@@ -26,6 +26,10 @@ ifndef ONLY_CBLAS
|
||||
ONLY_CBLAS = 0
|
||||
endif
|
||||
|
||||
ifndef BUILD_LAPACK_DEPRECATED
|
||||
BUILD_LAPACK_DEPRECATED = 0
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifndef ONLY_CBLAS
|
||||
@@ -92,10 +96,10 @@ dll : ../$(LIBDLLNAME)
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
libopenblas.def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
@@ -205,23 +209,23 @@ static : ../$(LIBNAME)
|
||||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
||||
@@ -590,6 +590,13 @@
|
||||
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
|
||||
);
|
||||
|
||||
@lapack_deprecated_objs = (
|
||||
cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp,
|
||||
cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd,
|
||||
cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm,
|
||||
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
|
||||
);
|
||||
|
||||
@lapackeobjs = (
|
||||
# LAPACK C interface routines.
|
||||
#
|
||||
@@ -2984,6 +2991,11 @@ if ($ARGV[8] == 1) {
|
||||
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
|
||||
};
|
||||
|
||||
if ($ARGV[11] == 1){
|
||||
#BUILD_LAPACK_DEPRECATED=1
|
||||
@underscore_objs =(@underscore_objs, @lapack_deprecated_objs);
|
||||
}
|
||||
|
||||
} else {
|
||||
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
||||
}
|
||||
|
||||
15
getarch.c
15
getarch.c
@@ -862,6 +862,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#if defined(__zarch__) || defined(__s390x__)
|
||||
#define ZARCH
|
||||
#include "cpuid_zarch.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_AMD
|
||||
#include "cpuid_x86.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
@@ -957,7 +963,7 @@ int main(int argc, char *argv[]){
|
||||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
@@ -1013,7 +1019,12 @@ int main(int argc, char *argv[]){
|
||||
#endif
|
||||
|
||||
#ifdef MAKE_NB_JOBS
|
||||
#if MAKE_NB_JOBS > 0
|
||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
||||
#else
|
||||
// Let make use parent -j argument or -j1 if there
|
||||
// is no make parent
|
||||
#endif
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
#else
|
||||
@@ -1059,7 +1070,7 @@ int main(int argc, char *argv[]){
|
||||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -64,10 +64,13 @@ int main(int argc, char **argv) {
|
||||
|
||||
|
||||
if ((argc >= 2) && (*argv[1] == '1')) {
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64)
|
||||
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
|
||||
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
|
||||
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
|
||||
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
|
||||
#endif
|
||||
|
||||
#ifdef USE64BITINT
|
||||
printf("#define USE64BITINT\n");
|
||||
|
||||
@@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
||||
blasint incy = *INCY;
|
||||
|
||||
FLOAT *buffer;
|
||||
int buffer_size;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
#endif
|
||||
@@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
FLOAT *buffer;
|
||||
blasint lenx, leny;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
blasint info, t;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
@@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
||||
if (incy < 0) y -= (leny - 1) * incy * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT) ;
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
// cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746)
|
||||
if(trans && stack_alloc_size)
|
||||
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
@@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
||||
@@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||
if (incx < 0) x -= (m - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
STACK_ALLOC(2 * m, FLOAT, buffer);
|
||||
|
||||
#ifdef SMPTEST
|
||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||
@@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
|
||||
blasint info;
|
||||
int uplo;
|
||||
int unit;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
|
||||
|
||||
int trans, uplo, unit;
|
||||
int trans, uplo, unit, buffer_size;
|
||||
blasint info;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#ifdef SMP
|
||||
// Calibrated on a Xeon E5-2630
|
||||
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
|
||||
nthreads = num_cpu_avail(2);
|
||||
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 2;
|
||||
} else
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
|
||||
if(incx != 1)
|
||||
buffer_size += n * 2;
|
||||
}
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
|
||||
|
||||
|
||||
@@ -227,6 +227,28 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
#gemm3m
|
||||
# When USE_GEMM3M is set, register the gemm3m compute kernel plus twelve copy
# routines for the current ${float_type}. The copy routines are built from the
# generic zgemm3m_ncopy_* / zgemm3m_tcopy_* sources selected by
# ${float_char}GEMM3M_UNROLL_N or ${float_char}GEMM3M_UNROLL_M.
# NOTE(review): GenerateNamedObjects is a project helper not visible here;
# the meaning of its positional false/"" arguments should be confirmed there.
if (USE_GEMM3M)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM3MKERNEL}" "NN" "gemm3m_kernel" false "" "" false ${float_type})
|
||||
|
||||
# "o" ncopy variants: UNROLL_N source, USE_ALPHA define; b / r / i suffixes
# add nothing, REAL_ONLY and IMAGE_ONLY respectively.
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_oncopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_oncopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_oncopyi" false "" "" false ${float_type})
|
||||
|
||||
# "o" tcopy variants: same defines as above, tcopy source.
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_otcopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_otcopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_otcopyi" false "" "" false ${float_type})
|
||||
|
||||
# "i" ncopy variants: UNROLL_M source, ICOPY define instead of USE_ALPHA.
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_incopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_incopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_incopyi" false "" "" false ${float_type})
|
||||
|
||||
# "i" tcopy variants: UNROLL_M source, ICOPY define.
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_itcopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_itcopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_itcopyi" false "" "" false ${float_type})
|
||||
|
||||
endif()
|
||||
|
||||
else () #For real
|
||||
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
|
||||
|
||||
|
||||
@@ -40,6 +40,10 @@ ifeq ($(CORE), POWER8)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z13)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4.S
|
||||
CTRMMKERNEL = ctrmm_kernel_4x4.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_4x4.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x4.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
endif
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_4x4.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
|
||||
@@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.4s, v3.4s}, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
fmul v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
fmul v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.4s, v2.4s, v9.4s[0]
|
||||
fmls v19.4s, v2.4s, v9.s[0]
|
||||
#else
|
||||
fmul v19.4s, v2.4s, v9.4s[0]
|
||||
fmul v19.4s, v2.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmul v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
fmul v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.4s, v2.4s, v9.4s[1]
|
||||
fmls v23.4s, v2.4s, v9.s[1]
|
||||
#else
|
||||
fmul v23.4s, v2.4s, v9.4s[1]
|
||||
fmul v23.4s, v2.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmul v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
fmul v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.4s, v2.4s, v9.4s[2]
|
||||
fmls v27.4s, v2.4s, v9.s[2]
|
||||
#else
|
||||
fmul v27.4s, v2.4s, v9.4s[2]
|
||||
fmul v27.4s, v2.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
fmul v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
fmul v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.4s, v2.4s, v9.4s[3]
|
||||
fmls v31.4s, v2.4s, v9.s[3]
|
||||
#else
|
||||
fmul v31.4s, v2.4s, v9.4s[3]
|
||||
fmul v31.4s, v2.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
@@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
ld2 {v4.4s, v5.4s} , [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
ld2 {v6.4s, v7.4s} , [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #512]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
ld2 {v0.4s, v1.4s}, [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #512]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
ld2 {v2.4s, v3.4s}, [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
@@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
ld2 {v2.4s, v3.4s}, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
@@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.4s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.4s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.4s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.4s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.4s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.4s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.4s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.4s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
OP_rr v24.2s, v0.2s, v8.4s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.4s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.4s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.4s[2]
|
||||
OP_rr v24.2s, v0.2s, v8.s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.s[2]
|
||||
|
||||
OP_rr v28.2s, v0.2s, v8.4s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.4s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.4s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.4s[3]
|
||||
OP_rr v28.2s, v0.2s, v8.s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.4s[0]
|
||||
OP_ii s16, s1, v9.4s[0]
|
||||
OP_ri s17, s0, v9.4s[0]
|
||||
OP_ir s17, s1, v8.4s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.4s[1]
|
||||
OP_ii s20, s1, v9.4s[1]
|
||||
OP_ri s21, s0, v9.4s[1]
|
||||
OP_ir s21, s1, v8.4s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
|
||||
OP_rr s24, s0, v8.4s[2]
|
||||
OP_ii s24, s1, v9.4s[2]
|
||||
OP_ri s25, s0, v9.4s[2]
|
||||
OP_ir s25, s1, v8.4s[2]
|
||||
OP_rr s24, s0, v8.s[2]
|
||||
OP_ii s24, s1, v9.s[2]
|
||||
OP_ri s25, s0, v9.s[2]
|
||||
OP_ir s25, s1, v8.s[2]
|
||||
|
||||
OP_rr s28, s0, v8.4s[3]
|
||||
OP_ii s28, s1, v9.4s[3]
|
||||
OP_ri s29, s0, v9.4s[3]
|
||||
OP_ir s29, s1, v8.4s[3]
|
||||
OP_rr s28, s0, v8.s[3]
|
||||
OP_ii s28, s1, v9.s[3]
|
||||
OP_ri s29, s0, v9.s[3]
|
||||
OP_ir s29, s1, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
@@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.2s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.2s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.2s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.2s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.2s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.2s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.2s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.2s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.2s[0]
|
||||
OP_ii s16, s1, v9.2s[0]
|
||||
OP_ri s17, s0, v9.2s[0]
|
||||
OP_ir s17, s1, v8.2s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.2s[1]
|
||||
OP_ii s20, s1, v9.2s[1]
|
||||
OP_ri s21, s0, v9.2s[1]
|
||||
OP_ir s21, s1, v8.2s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
||||
2044
kernel/arm64/cgemm_kernel_8x4.S
Normal file
2044
kernel/arm64/cgemm_kernel_8x4.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
@@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
ld2 {v4.4s, v5.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
ld2 {v0.4s, v1.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
@@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.4s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.4s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.4s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.4s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.4s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.4s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.4s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.4s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
OP_rr v24.2s, v0.2s, v8.4s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.4s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.4s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.4s[2]
|
||||
OP_rr v24.2s, v0.2s, v8.s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.s[2]
|
||||
|
||||
OP_rr v28.2s, v0.2s, v8.4s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.4s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.4s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.4s[3]
|
||||
OP_rr v28.2s, v0.2s, v8.s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.4s[0]
|
||||
OP_ii s16, s1, v9.4s[0]
|
||||
OP_ri s17, s0, v9.4s[0]
|
||||
OP_ir s17, s1, v8.4s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.4s[1]
|
||||
OP_ii s20, s1, v9.4s[1]
|
||||
OP_ri s21, s0, v9.4s[1]
|
||||
OP_ir s21, s1, v8.4s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
|
||||
OP_rr s24, s0, v8.4s[2]
|
||||
OP_ii s24, s1, v9.4s[2]
|
||||
OP_ri s25, s0, v9.4s[2]
|
||||
OP_ir s25, s1, v8.4s[2]
|
||||
OP_rr s24, s0, v8.s[2]
|
||||
OP_ii s24, s1, v9.s[2]
|
||||
OP_ri s25, s0, v9.s[2]
|
||||
OP_ir s25, s1, v8.s[2]
|
||||
|
||||
OP_rr s28, s0, v8.4s[3]
|
||||
OP_ii s28, s1, v9.4s[3]
|
||||
OP_ri s29, s0, v9.4s[3]
|
||||
OP_ir s29, s1, v8.4s[3]
|
||||
OP_rr s28, s0, v8.s[3]
|
||||
OP_ii s28, s1, v9.s[3]
|
||||
OP_ri s29, s0, v9.s[3]
|
||||
OP_ir s29, s1, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
@@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.2s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.2s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.2s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.2s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.2s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.2s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.2s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.2s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.2s[0]
|
||||
OP_ii s16, s1, v9.2s[0]
|
||||
OP_ri s17, s0, v9.2s[0]
|
||||
OP_ir s17, s1, v8.2s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.2s[1]
|
||||
OP_ii s20, s1, v9.2s[1]
|
||||
OP_ri s21, s0, v9.2s[1]
|
||||
OP_ir s21, s1, v8.2s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
||||
2425
kernel/arm64/ctrmm_kernel_8x4.S
Normal file
2425
kernel/arm64/ctrmm_kernel_8x4.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define ppC x16
|
||||
#define ppCRow0 x17
|
||||
#define ppCRow1 x18
|
||||
#define ppCRow2 x19
|
||||
#define ppA x20
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define ppC x17
|
||||
#define ppCRow0 x18
|
||||
#define ppCRow1 x19
|
||||
#define ppCRow2 x20
|
||||
#define ppCRow3 x21
|
||||
#define ppA x22
|
||||
#define alpha x23
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaV0 v10.d[0]
|
||||
#define alpha1 d11
|
||||
#define alphaV1 v11.d[0]
|
||||
#define alpha2 d14
|
||||
#define alphaV2 v14.d[0]
|
||||
#define alpha3 d15
|
||||
#define alphaV3 v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 1024
|
||||
#define B_PRE_SIZE 1024
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16 ppC
|
||||
// 17 ppCRow0
|
||||
// 18 must save ppCRow1
|
||||
// 19 must save ppCRow2
|
||||
// 20 must save ppA
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 ppC
|
||||
// 18 must save ppCRow0
|
||||
// 19 must save ppCRow1
|
||||
// 20 must save ppCRow2
|
||||
// 21 must save ppCRow3
|
||||
// 22 must save ppA
|
||||
// 23 must save alpha
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//v08 must save pB00, pB01
|
||||
//v09 must save pB02, pB03
|
||||
//v10 must save ALPHA0
|
||||
//v11 must save ALPHA1
|
||||
//v11 must save
|
||||
//v12 must save pB10, pB11
|
||||
//v13 must save pB12, pB13
|
||||
//v14 must save ALPHA2
|
||||
//v15 must save ALPHA3
|
||||
//v14 must save
|
||||
//v15 must save
|
||||
//v16 must save C00, C01
|
||||
//v17 must save C02, C03
|
||||
//v18 ppC00, ppC01
|
||||
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_I
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppA]
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
fmul v31.2d, v3.2d, v9.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
fmul v27.2d, v3.2d, v9.2d[0]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v31.2d, v3.2d, v11.d[0]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
|
||||
|
||||
ld1 {v4.2d, v5.2d} , [pA] // for next round
|
||||
fmul v22.2d, v2.2d, v9.d[0]
|
||||
fmul v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v9.2d[0]
|
||||
fmul v23.2d, v3.2d, v8.2d[1]
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
fmul v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
ld1 {v6.2d, v7.2d} , [ppA] // for next round
|
||||
ldp q6, q7, [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v30.2d, v2.2d, v9.2d[1]
|
||||
fmul v19.2d, v3.2d, v8.2d[0]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v30.2d, v2.2d, v11.d[0]
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v31.2d, v7.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v22.2d, v6.2d, v12.2d[1]
|
||||
fmla v27.2d, v7.2d, v13.2d[0]
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v26.2d, v6.2d, v13.2d[0]
|
||||
fmla v23.2d, v7.2d, v12.2d[1]
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppA]
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v30.2d, v6.2d, v13.2d[1]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v31.2d, v3.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v27.2d, v3.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d} , [pA] // for next round
|
||||
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v26.2d, v2.2d, v9.2d[0]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
ld1 {v6.2d, v7.2d} , [ppA] // for next round
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ldp q6, q7, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v30.2d, v2.2d, v9.2d[1]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v27.2d, v7.2d, v13.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v22.2d, v6.2d, v12.2d[1]
|
||||
fmla v31.2d, v7.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v26.2d, v6.2d, v13.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v30.2d, v6.2d, v13.2d[1]
|
||||
fmla v23.2d, v7.2d, v12.2d[1]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppA]
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v31.2d, v3.2d, v9.2d[1]
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v27.2d, v3.2d, v9.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmla v26.2d, v2.2d, v9.2d[0]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v30.2d, v2.2d, v9.2d[1]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add ppCRow0, pCRow0, #32
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pCRow0]
|
||||
ldp q0, q1, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
fmla v1.2d, v17.2d, alphaV1
|
||||
st1 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV2
|
||||
fmla v3.2d, v19.2d, alphaV3
|
||||
st1 {v2.2d, v3.2d}, [ppCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
add ppCRow1, ppCRow0, LDC
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV1
|
||||
st1 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV2
|
||||
fmla v7.2d, v23.2d, alphaV3
|
||||
st1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
add ppCRow2, ppCRow1, LDC
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0
|
||||
fmla v1.2d, v25.2d, alphaV1
|
||||
st1 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV2
|
||||
fmla v3.2d, v27.2d, alphaV3
|
||||
st1 {v2.2d, v3.2d}, [ppCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
add ppCRow1, ppCRow2, LDC
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v28.2d, alphaV0
|
||||
fmla v5.2d, v29.2d, alphaV1
|
||||
st1 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
fmla v6.2d, v30.2d, alphaV2
|
||||
fmla v7.2d, v31.2d, alphaV3
|
||||
st1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
fmla v1.2d, v17.2d, alphaV0
|
||||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
||||
ldp q2, q3, [ppCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0
|
||||
fmla v3.2d, v19.2d, alphaV0
|
||||
stp q2, q3, [ppCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add ppCRow1, pCRow1, #32
|
||||
|
||||
ldp q4, q5, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV0
|
||||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #64
|
||||
|
||||
ldp q6, q7, [ppCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0
|
||||
fmla v7.2d, v23.2d, alphaV0
|
||||
stp q6, q7, [ppCRow1]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
add ppCRow2, pCRow2, #32
|
||||
|
||||
ldp q0, q1, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0
|
||||
fmla v1.2d, v25.2d, alphaV0
|
||||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #64
|
||||
|
||||
ldp q2, q3, [ppCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0
|
||||
fmla v3.2d, v27.2d, alphaV0
|
||||
stp q2, q3, [ppCRow2]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
add ppCRow3, pCRow3, #32
|
||||
|
||||
ldp q4, q5, [pCRow3]
|
||||
fmla v4.2d, v28.2d, alphaV0
|
||||
fmla v5.2d, v29.2d, alphaV0
|
||||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #64
|
||||
|
||||
ldp q6, q7, [ppCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0
|
||||
fmla v7.2d, v31.2d, alphaV0
|
||||
stp q6, q7, [ppCRow3]
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
@@ -389,44 +422,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV2
|
||||
fmla v13.2d, v21.2d, alphaV3
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
fmla v9.2d, v25.2d, alphaV1
|
||||
fmla v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v28.2d, alphaV2
|
||||
fmla v13.2d, v29.2d, alphaV3
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
fmla v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
@@ -447,13 +482,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
ld1 {v8.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV2
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v28.2d, alphaV3
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ld1 {v12.d}[0], [pCRow2]
|
||||
ld1 {v12.d}[1], [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
|
||||
@@ -533,23 +572,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV2
|
||||
fmla v13.2d, v21.2d, alphaV3
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
@@ -569,11 +610,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
@@ -600,10 +643,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
@@ -629,14 +674,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
@@ -658,10 +705,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ldr d8, [pCRow0]
|
||||
fmadd d8, d16, alpha0, d8
|
||||
str d8, [pCRow0]
|
||||
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha0, d0
|
||||
fmov alpha1, d0
|
||||
fmov alpha2, d0
|
||||
fmov alpha3, d0
|
||||
fmov alpha, d0
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
|
||||
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ble dgemm_kernel_L2_BEGIN
|
||||
|
||||
dgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
lsl temp, origK, #5 // k * 4 * 8
|
||||
mov pA, origPA // pA = start of A array
|
||||
add ppA, temp, pA
|
||||
prfm PLDL1KEEP, [ppA]
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
|
||||
cmp counterI, #0
|
||||
ble dgemm_kernel_L4_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_20:
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , origK, #2 // L = K / 4
|
||||
cmp counterL , #2
|
||||
blt dgemm_kernel_L4_M8_32
|
||||
|
||||
KERNEL8x4_I // do one in the K
|
||||
KERNEL8x4_M2 // do another in the K
|
||||
KERNEL8x4_I
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble dgemm_kernel_L4_M8_22a
|
||||
|
||||
.align 5
|
||||
|
||||
dgemm_kernel_L4_M8_22:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M8_22
|
||||
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_22a:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dgemm_kernel_L4_M8_44
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble dgemm_kernel_L4_M8_40
|
||||
|
||||
KERNEL8x4_I
|
||||
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dgemm_kernel_L4_M8_44
|
||||
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
|
||||
|
||||
dgemm_kernel_L4_M8_44:
|
||||
|
||||
ands counterL , origK, #1
|
||||
ands counterL , origK, #3
|
||||
ble dgemm_kernel_L4_M8_100
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_46:
|
||||
|
||||
KERNEL8x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne dgemm_kernel_L4_M8_46
|
||||
|
||||
dgemm_kernel_L4_M8_100:
|
||||
lsl temp, origK, #5
|
||||
prfm PLDL1KEEP, [pA, temp]
|
||||
prfm PLDL1KEEP, [ppA, temp]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVE8x4
|
||||
|
||||
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
|
||||
subs counterI, counterI, #1
|
||||
bne dgemm_kernel_L4_M8_20
|
||||
|
||||
|
||||
dgemm_kernel_L4_M4_BEGIN:
|
||||
mov counterI, origM
|
||||
tst counterI , #7
|
||||
|
||||
1689
kernel/arm64/dgemm_kernel_4x8.S
Normal file
1689
kernel/arm64/dgemm_kernel_4x8.S
Normal file
File diff suppressed because it is too large
Load Diff
1602
kernel/arm64/dgemm_kernel_8x4.S
Normal file
1602
kernel/arm64/dgemm_kernel_8x4.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
@@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
@@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
@@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
||||
2026
kernel/arm64/dtrmm_kernel_4x8.S
Normal file
2026
kernel/arm64/dtrmm_kernel_4x8.S
Normal file
File diff suppressed because it is too large
Load Diff
1849
kernel/arm64/dtrmm_kernel_8x4.S
Normal file
1849
kernel/arm64/dtrmm_kernel_8x4.S
Normal file
File diff suppressed because it is too large
Load Diff
1987
kernel/arm64/sgemm_kernel_16x4.S
Normal file
1987
kernel/arm64/sgemm_kernel_16x4.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.4s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
|
||||
ld1 {v2.4s}, [pA_1]
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pA_2]
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmul v17.4s, v2.4s, v8.4s[0]
|
||||
fmul v21.4s, v2.4s, v8.4s[1]
|
||||
fmul v17.4s, v2.4s, v8.s[0]
|
||||
fmul v21.4s, v2.4s, v8.s[1]
|
||||
|
||||
ld1 {v6.4s}, [pA_3]
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
fmul v25.4s, v2.4s, v8.4s[2]
|
||||
fmul v29.4s, v2.4s, v8.4s[3]
|
||||
fmul v25.4s, v2.4s, v8.s[2]
|
||||
fmul v29.4s, v2.4s, v8.s[3]
|
||||
|
||||
ld1 {v12.4s}, [pB] // for next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v18.4s, v4.4s, v8.4s[0]
|
||||
fmul v19.4s, v6.4s, v8.4s[0]
|
||||
fmul v18.4s, v4.4s, v8.s[0]
|
||||
fmul v19.4s, v6.4s, v8.s[0]
|
||||
|
||||
ld1 {v1.4s}, [pA_0] // for next round
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmul v22.4s, v4.4s, v8.4s[1]
|
||||
fmul v23.4s, v6.4s, v8.4s[1]
|
||||
fmul v22.4s, v4.4s, v8.s[1]
|
||||
fmul v23.4s, v6.4s, v8.s[1]
|
||||
|
||||
ld1 {v3.4s}, [pA_1] // for next round
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmul v26.4s, v4.4s, v8.4s[2]
|
||||
fmul v27.4s, v6.4s, v8.4s[2]
|
||||
fmul v26.4s, v4.4s, v8.s[2]
|
||||
fmul v27.4s, v6.4s, v8.s[2]
|
||||
|
||||
ld1 {v5.4s}, [pA_2] // for next round
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmul v30.4s, v4.4s, v8.4s[3]
|
||||
fmul v31.4s, v6.4s, v8.4s[3]
|
||||
fmul v30.4s, v4.4s, v8.s[3]
|
||||
fmul v31.4s, v6.4s, v8.s[3]
|
||||
|
||||
ld1 {v7.4s}, [pA_3] // for next round
|
||||
add pA_3, pA_3, #16
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_M2
|
||||
fmla v16.4s, v1.4s, v12.4s[0]
|
||||
fmla v17.4s, v3.4s, v12.4s[0]
|
||||
fmla v16.4s, v1.4s, v12.s[0]
|
||||
fmla v17.4s, v3.4s, v12.s[0]
|
||||
|
||||
ld1 {v8.4s}, [pB] // for next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.4s, v5.4s, v12.4s[0]
|
||||
fmla v19.4s, v7.4s, v12.4s[0]
|
||||
fmla v18.4s, v5.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
ld1 {v0.4s}, [pA_0] // for next round
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v20.4s, v1.4s, v12.4s[1]
|
||||
fmla v21.4s, v3.4s, v12.4s[1]
|
||||
fmla v20.4s, v1.4s, v12.s[1]
|
||||
fmla v21.4s, v3.4s, v12.s[1]
|
||||
|
||||
ld1 {v2.4s}, [pA_1] // for next round
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmla v22.4s, v5.4s, v12.4s[1]
|
||||
fmla v23.4s, v7.4s, v12.4s[1]
|
||||
fmla v22.4s, v5.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
ld1 {v4.4s}, [pA_2] // for next round
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmla v24.4s, v1.4s, v12.4s[2]
|
||||
fmla v25.4s, v3.4s, v12.4s[2]
|
||||
fmla v24.4s, v1.4s, v12.s[2]
|
||||
fmla v25.4s, v3.4s, v12.s[2]
|
||||
|
||||
ld1 {v6.4s}, [pA_3] // for next round
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
fmla v26.4s, v5.4s, v12.4s[2]
|
||||
fmla v27.4s, v7.4s, v12.4s[2]
|
||||
fmla v26.4s, v5.4s, v12.s[2]
|
||||
fmla v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA_2, #512]
|
||||
|
||||
fmla v28.4s, v1.4s, v12.4s[3]
|
||||
fmla v29.4s, v3.4s, v12.4s[3]
|
||||
fmla v28.4s, v1.4s, v12.s[3]
|
||||
fmla v29.4s, v3.4s, v12.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pA_3, #512]
|
||||
|
||||
fmla v30.4s, v5.4s, v12.4s[3]
|
||||
fmla v31.4s, v7.4s, v12.4s[3]
|
||||
fmla v30.4s, v5.4s, v12.s[3]
|
||||
fmla v31.4s, v7.4s, v12.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_M1
|
||||
fmla v16.4s, v0.4s, v8.4s[0]
|
||||
fmla v17.4s, v2.4s, v8.4s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v2.4s, v8.s[0]
|
||||
|
||||
ld1 {v12.4s}, [pB] // for next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.4s, v4.4s, v8.4s[0]
|
||||
fmla v19.4s, v6.4s, v8.4s[0]
|
||||
fmla v18.4s, v4.4s, v8.s[0]
|
||||
fmla v19.4s, v6.4s, v8.s[0]
|
||||
|
||||
ld1 {v1.4s}, [pA_0] // for next round
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v20.4s, v0.4s, v8.4s[1]
|
||||
fmla v21.4s, v2.4s, v8.4s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v2.4s, v8.s[1]
|
||||
|
||||
ld1 {v3.4s}, [pA_1] // for next round
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmla v22.4s, v4.4s, v8.4s[1]
|
||||
fmla v23.4s, v6.4s, v8.4s[1]
|
||||
fmla v22.4s, v4.4s, v8.s[1]
|
||||
fmla v23.4s, v6.4s, v8.s[1]
|
||||
|
||||
ld1 {v5.4s}, [pA_2] // for next round
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmla v24.4s, v0.4s, v8.4s[2]
|
||||
fmla v25.4s, v2.4s, v8.4s[2]
|
||||
fmla v24.4s, v0.4s, v8.s[2]
|
||||
fmla v25.4s, v2.4s, v8.s[2]
|
||||
|
||||
ld1 {v7.4s}, [pA_3] // for next round
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
fmla v26.4s, v4.4s, v8.4s[2]
|
||||
fmla v27.4s, v6.4s, v8.4s[2]
|
||||
fmla v26.4s, v4.4s, v8.s[2]
|
||||
fmla v27.4s, v6.4s, v8.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA_0, #512]
|
||||
|
||||
fmla v28.4s, v0.4s, v8.4s[3]
|
||||
fmla v29.4s, v2.4s, v8.4s[3]
|
||||
fmla v28.4s, v0.4s, v8.s[3]
|
||||
fmla v29.4s, v2.4s, v8.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pA_1, #512]
|
||||
|
||||
fmla v30.4s, v4.4s, v8.4s[3]
|
||||
fmla v31.4s, v6.4s, v8.4s[3]
|
||||
fmla v30.4s, v4.4s, v8.s[3]
|
||||
fmla v31.4s, v6.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_E
|
||||
fmla v16.4s, v1.4s, v12.4s[0]
|
||||
fmla v17.4s, v3.4s, v12.4s[0]
|
||||
fmla v18.4s, v5.4s, v12.4s[0]
|
||||
fmla v19.4s, v7.4s, v12.4s[0]
|
||||
fmla v20.4s, v1.4s, v12.4s[1]
|
||||
fmla v21.4s, v3.4s, v12.4s[1]
|
||||
fmla v22.4s, v5.4s, v12.4s[1]
|
||||
fmla v23.4s, v7.4s, v12.4s[1]
|
||||
fmla v24.4s, v1.4s, v12.4s[2]
|
||||
fmla v25.4s, v3.4s, v12.4s[2]
|
||||
fmla v26.4s, v5.4s, v12.4s[2]
|
||||
fmla v27.4s, v7.4s, v12.4s[2]
|
||||
fmla v28.4s, v1.4s, v12.4s[3]
|
||||
fmla v29.4s, v3.4s, v12.4s[3]
|
||||
fmla v30.4s, v5.4s, v12.4s[3]
|
||||
fmla v31.4s, v7.4s, v12.4s[3]
|
||||
fmla v16.4s, v1.4s, v12.s[0]
|
||||
fmla v17.4s, v3.4s, v12.s[0]
|
||||
fmla v18.4s, v5.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
fmla v20.4s, v1.4s, v12.s[1]
|
||||
fmla v21.4s, v3.4s, v12.s[1]
|
||||
fmla v22.4s, v5.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
fmla v24.4s, v1.4s, v12.s[2]
|
||||
fmla v25.4s, v3.4s, v12.s[2]
|
||||
fmla v26.4s, v5.4s, v12.s[2]
|
||||
fmla v27.4s, v7.4s, v12.s[2]
|
||||
fmla v28.4s, v1.4s, v12.s[3]
|
||||
fmla v29.4s, v3.4s, v12.s[3]
|
||||
fmla v30.4s, v5.4s, v12.s[3]
|
||||
fmla v31.4s, v7.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_SUB
|
||||
@@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.4s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.4s[0]
|
||||
fmla v20.4s, v0.4s, v8.4s[1]
|
||||
fmla v24.4s, v0.4s, v8.4s[2]
|
||||
fmla v28.4s, v0.4s, v8.4s[3]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v8.s[2]
|
||||
fmla v28.4s, v0.4s, v8.s[3]
|
||||
|
||||
ld1 {v2.4s}, [pA_1]
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmla v17.4s, v2.4s, v8.4s[0]
|
||||
fmla v21.4s, v2.4s, v8.4s[1]
|
||||
fmla v25.4s, v2.4s, v8.4s[2]
|
||||
fmla v29.4s, v2.4s, v8.4s[3]
|
||||
fmla v17.4s, v2.4s, v8.s[0]
|
||||
fmla v21.4s, v2.4s, v8.s[1]
|
||||
fmla v25.4s, v2.4s, v8.s[2]
|
||||
fmla v29.4s, v2.4s, v8.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pA_2]
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmla v18.4s, v4.4s, v8.4s[0]
|
||||
fmla v22.4s, v4.4s, v8.4s[1]
|
||||
fmla v26.4s, v4.4s, v8.4s[2]
|
||||
fmla v30.4s, v4.4s, v8.4s[3]
|
||||
fmla v18.4s, v4.4s, v8.s[0]
|
||||
fmla v22.4s, v4.4s, v8.s[1]
|
||||
fmla v26.4s, v4.4s, v8.s[2]
|
||||
fmla v30.4s, v4.4s, v8.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pA_3]
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
fmla v19.4s, v6.4s, v8.4s[0]
|
||||
fmla v23.4s, v6.4s, v8.4s[1]
|
||||
fmla v27.4s, v6.4s, v8.4s[2]
|
||||
fmla v31.4s, v6.4s, v8.4s[3]
|
||||
fmla v19.4s, v6.4s, v8.s[0]
|
||||
fmla v23.4s, v6.4s, v8.s[1]
|
||||
fmla v27.4s, v6.4s, v8.s[2]
|
||||
fmla v31.4s, v6.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x4
|
||||
@@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v2.2s, v3.2s}, [pA_1]
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
fmla v18.2s, v2.2s, v8.2s[0]
|
||||
fmla v31.2s, v3.2s, v9.2s[1]
|
||||
fmla v22.2s, v2.2s, v8.2s[1]
|
||||
fmla v27.2s, v3.2s, v9.2s[0]
|
||||
fmla v18.2s, v2.2s, v8.s[0]
|
||||
fmla v31.2s, v3.2s, v9.s[1]
|
||||
fmla v22.2s, v2.2s, v8.s[1]
|
||||
fmla v27.2s, v3.2s, v9.s[0]
|
||||
|
||||
fmla v26.2s, v2.2s, v9.2s[0]
|
||||
fmla v23.2s, v3.2s, v8.2s[1]
|
||||
fmla v30.2s, v2.2s, v9.2s[1]
|
||||
fmla v19.2s, v3.2s, v8.2s[0]
|
||||
fmla v26.2s, v2.2s, v9.s[0]
|
||||
fmla v23.2s, v3.2s, v8.s[1]
|
||||
fmla v30.2s, v2.2s, v9.s[1]
|
||||
fmla v19.2s, v3.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
@@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s}, [pA_0]
|
||||
add pA_0, pA_0, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s}, [pA_0]
|
||||
add pA_0, pA_0, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ldr s0 , [pA_0]
|
||||
add pA_0, pA_0, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
@@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0 , pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
@@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s}, [pA_0]
|
||||
add pA_0 , pA_0, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
||||
2305
kernel/arm64/sgemm_kernel_8x8.S
Normal file
2305
kernel/arm64/sgemm_kernel_8x8.S
Normal file
File diff suppressed because it is too large
Load Diff
2431
kernel/arm64/strmm_kernel_16x4.S
Normal file
2431
kernel/arm64/strmm_kernel_16x4.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.2s, v0.2s, v8.2s[0]
|
||||
fmul v29.2s, v1.2s, v9.2s[1]
|
||||
fmul v16.2s, v0.2s, v8.s[0]
|
||||
fmul v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmul v20.2s, v0.2s, v8.2s[1]
|
||||
fmul v25.2s, v1.2s, v9.2s[0]
|
||||
fmul v20.2s, v0.2s, v8.s[1]
|
||||
fmul v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmul v24.2s, v0.2s, v9.2s[0]
|
||||
fmul v21.2s, v1.2s, v8.2s[1]
|
||||
fmul v24.2s, v0.2s, v9.s[0]
|
||||
fmul v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmul v28.2s, v0.2s, v9.2s[1]
|
||||
fmul v17.2s, v1.2s, v8.2s[0]
|
||||
fmul v28.2s, v0.2s, v9.s[1]
|
||||
fmul v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
@@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v4.2s, v5.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
ld1 {v0.2s, v1.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ldr s0 , [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
@@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
@@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld1 {v0.2s}, [pA]
|
||||
add pA , pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
||||
2795
kernel/arm64/strmm_kernel_8x8.S
Normal file
2795
kernel/arm64/strmm_kernel_8x8.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -147,12 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmla v4.4s, v3.4s, v17.4s
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ)
|
||||
|
||||
fmla v5.4s, v2.4s, v17.4s
|
||||
#else
|
||||
fmls v5.4s, v2.4s, v17.4s
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v5.4s, v3.4s, v16.4s
|
||||
#else
|
||||
fmls v5.4s, v3.4s, v16.4s
|
||||
#endif
|
||||
|
||||
st2 {v4.4s, v5.4s}, [Y], #32
|
||||
#else // DOUBLE
|
||||
@@ -165,12 +166,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
fmla v4.2d, v3.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
|
||||
fmla v5.2d, v2.2d, v17.2d
|
||||
#else
|
||||
fmls v5.2d, v2.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v5.2d, v3.2d, v16.2d
|
||||
#else
|
||||
fmls v5.2d, v3.2d, v16.2d
|
||||
#endif
|
||||
|
||||
st2 {v4.2d, v5.2d}, [Y], #32
|
||||
|
||||
@@ -183,13 +185,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
fmla v20.2d, v19.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v21.2d, v18.2d, v17.2d
|
||||
#else
|
||||
fmls v21.2d, v18.2d, v17.2d
|
||||
#endif
|
||||
fmla v21.2d, v19.2d, v16.2d
|
||||
|
||||
fmla v21.2d, v18.2d, v17.2d
|
||||
#if !defined(CONJ)
|
||||
fmla v21.2d, v19.2d, v16.2d
|
||||
#else
|
||||
fmls v21.2d, v19.2d, v16.2d
|
||||
#endif
|
||||
st2 {v20.2d, v21.2d}, [Y], #32
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #512]
|
||||
|
||||
@@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.2d, v0.2d, v9.2d[0]
|
||||
fmls v17.2d, v0.2d, v9.d[0]
|
||||
#else
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
fmul v17.2d, v0.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.2d[0]
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.2d, v0.2d, v9.2d[1]
|
||||
fmls v21.2d, v0.2d, v9.d[1]
|
||||
#else
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
fmul v21.2d, v0.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.2d, v2.2d, v9.2d[1]
|
||||
fmls v23.2d, v2.2d, v9.d[1]
|
||||
#else
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
fmul v23.2d, v2.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.2d, v0.2d, v11.2d[0]
|
||||
fmls v25.2d, v0.2d, v11.d[0]
|
||||
#else
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
fmul v25.2d, v0.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.2d, v2.2d, v11.2d[0]
|
||||
fmls v27.2d, v2.2d, v11.d[0]
|
||||
#else
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
fmul v27.2d, v2.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.2d, v0.2d, v11.2d[1]
|
||||
fmls v29.2d, v0.2d, v11.d[1]
|
||||
#else
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
fmul v29.2d, v0.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.2d, v2.2d, v11.2d[1]
|
||||
fmls v31.2d, v2.2d, v11.d[1]
|
||||
#else
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
fmul v31.2d, v2.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
@@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.2d, v4.2d, v12.2d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.2d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.2d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.2d[0]
|
||||
OP_rr v16.2d, v4.2d, v12.d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.2d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.2d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.2d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.2d[0]
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.2d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.2d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.2d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.2d[1]
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.2d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.2d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.2d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.2d[1]
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.2d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.2d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.2d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.2d[0]
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.2d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.2d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.2d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.2d[0]
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.2d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.2d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.2d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.2d[1]
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.d[1]
|
||||
|
||||
OP_rr v30.2d, v6.2d, v14.2d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.2d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.2d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.2d[1]
|
||||
OP_rr v30.2d, v6.2d, v14.d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.2d, v4.2d, v12.2d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.2d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.2d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.2d[0]
|
||||
OP_rr v16.2d, v4.2d, v12.d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.2d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.2d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.2d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.2d[0]
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.2d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.2d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.2d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.2d[1]
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.2d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.2d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.2d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.2d[1]
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.2d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.2d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.2d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.2d[0]
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.2d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.2d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.2d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.2d[0]
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.2d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.2d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.2d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.2d[1]
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.d[1]
|
||||
|
||||
OP_rr v30.2d, v6.2d, v14.2d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.2d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.2d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.2d[1]
|
||||
OP_rr v30.2d, v6.2d, v14.d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
@@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
|
||||
OP_rr d24, d0, v10.2d[0]
|
||||
OP_ii d24, d1, v11.2d[0]
|
||||
OP_ri d25, d0, v11.2d[0]
|
||||
OP_ir d25, d1, v10.2d[0]
|
||||
OP_rr d24, d0, v10.d[0]
|
||||
OP_ii d24, d1, v11.d[0]
|
||||
OP_ri d25, d0, v11.d[0]
|
||||
OP_ir d25, d1, v10.d[0]
|
||||
|
||||
OP_rr d28, d0, v10.2d[1]
|
||||
OP_ii d28, d1, v11.2d[1]
|
||||
OP_ri d29, d0, v11.2d[1]
|
||||
OP_ir d29, d1, v10.2d[1]
|
||||
OP_rr d28, d0, v10.d[1]
|
||||
OP_ii d28, d1, v11.d[1]
|
||||
OP_ri d29, d0, v11.d[1]
|
||||
OP_ir d29, d1, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
@@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
||||
@@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.2d, v0.2d, v9.2d[0]
|
||||
fmls v17.2d, v0.2d, v9.d[0]
|
||||
#else
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
fmul v17.2d, v0.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.2d[0]
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.2d, v0.2d, v9.2d[1]
|
||||
fmls v21.2d, v0.2d, v9.d[1]
|
||||
#else
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
fmul v21.2d, v0.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.2d, v2.2d, v9.2d[1]
|
||||
fmls v23.2d, v2.2d, v9.d[1]
|
||||
#else
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
fmul v23.2d, v2.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.2d, v0.2d, v11.2d[0]
|
||||
fmls v25.2d, v0.2d, v11.d[0]
|
||||
#else
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
fmul v25.2d, v0.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.2d, v2.2d, v11.2d[0]
|
||||
fmls v27.2d, v2.2d, v11.d[0]
|
||||
#else
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
fmul v27.2d, v2.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.2d, v0.2d, v11.2d[1]
|
||||
fmls v29.2d, v0.2d, v11.d[1]
|
||||
#else
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
fmul v29.2d, v0.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.2d, v2.2d, v11.2d[1]
|
||||
fmls v31.2d, v2.2d, v11.d[1]
|
||||
#else
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
fmul v31.2d, v2.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
@@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.2d, v4.2d, v12.2d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.2d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.2d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.2d[0]
|
||||
OP_rr v16.2d, v4.2d, v12.d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.2d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.2d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.2d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.2d[0]
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.2d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.2d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.2d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.2d[1]
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.2d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.2d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.2d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.2d[1]
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.2d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.2d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.2d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.2d[0]
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.2d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.2d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.2d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.2d[0]
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.2d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.2d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.2d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.2d[1]
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.d[1]
|
||||
|
||||
OP_rr v30.2d, v6.2d, v14.2d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.2d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.2d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.2d[1]
|
||||
OP_rr v30.2d, v6.2d, v14.d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.2d, v4.2d, v12.2d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.2d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.2d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.2d[0]
|
||||
OP_rr v16.2d, v4.2d, v12.d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.2d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.2d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.2d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.2d[0]
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.2d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.2d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.2d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.2d[1]
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.2d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.2d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.2d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.2d[1]
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.2d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.2d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.2d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.2d[0]
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.2d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.2d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.2d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.2d[0]
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.2d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.2d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.2d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.2d[1]
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.d[1]
|
||||
|
||||
OP_rr v30.2d, v6.2d, v14.2d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.2d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.2d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.2d[1]
|
||||
OP_rr v30.2d, v6.2d, v14.d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
@@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
@@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
@@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
|
||||
OP_rr d24, d0, v10.2d[0]
|
||||
OP_ii d24, d1, v11.2d[0]
|
||||
OP_ri d25, d0, v11.2d[0]
|
||||
OP_ir d25, d1, v10.2d[0]
|
||||
OP_rr d24, d0, v10.d[0]
|
||||
OP_ii d24, d1, v11.d[0]
|
||||
OP_ri d25, d0, v11.d[0]
|
||||
OP_ir d25, d1, v10.d[0]
|
||||
|
||||
OP_rr d28, d0, v10.2d[1]
|
||||
OP_ii d28, d1, v11.2d[1]
|
||||
OP_ri d29, d0, v11.2d[1]
|
||||
OP_ir d29, d1, v10.2d[1]
|
||||
OP_rr d28, d0, v10.d[1]
|
||||
OP_ii d28, d1, v11.d[1]
|
||||
OP_ri d29, d0, v11.d[1]
|
||||
OP_ir d29, d1, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
@@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
@@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
@@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
||||
@@ -46,3 +46,7 @@ ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = zgemm_beta.S
|
||||
endif
|
||||
|
||||
ifndef DSDOTKERNEL
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
endif
|
||||
|
||||
|
||||
@@ -3,14 +3,18 @@
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = gemm_kernel_power6.S
|
||||
STRMMKERNEL = strmm_kernel_16x8_power8.S
|
||||
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_power6.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
@@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
@@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
#SASUMKERNEL = ../arm/asum.c
|
||||
#DASUMKERNEL = ../arm/asum.c
|
||||
#CASUMKERNEL = ../arm/zasum.c
|
||||
#ZASUMKERNEL = ../arm/zasum.c
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
#SAXPYKERNEL = ../arm/axpy.c
|
||||
#DAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
#CAXPYKERNEL = ../arm/zaxpy.c
|
||||
#ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
#SCOPYKERNEL = ../arm/copy.c
|
||||
#DCOPYKERNEL = ../arm/copy.c
|
||||
#CCOPYKERNEL = ../arm/zcopy.c
|
||||
#ZCOPYKERNEL = ../arm/zcopy.c
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
#
|
||||
#SDOTKERNEL = ../arm/dot.c
|
||||
#DDOTKERNEL = ../arm/dot.c
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
#CDOTKERNEL = ../arm/zdot.c
|
||||
#ZDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
#SNRM2KERNEL = ../arm/nrm2.c
|
||||
#DNRM2KERNEL = ../arm/nrm2.c
|
||||
#CNRM2KERNEL = ../arm/znrm2.c
|
||||
#ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
#SROTKERNEL = ../arm/rot.c
|
||||
#DROTKERNEL = ../arm/rot.c
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
#CROTKERNEL = ../arm/zrot.c
|
||||
#ZROTKERNEL = ../arm/zrot.c
|
||||
#
|
||||
#SSCALKERNEL = ../arm/scal.c
|
||||
#DSCALKERNEL = ../arm/scal.c
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
#CSCALKERNEL = ../arm/zscal.c
|
||||
#ZSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
#
|
||||
#SSWAPKERNEL = ../arm/swap.c
|
||||
#DSWAPKERNEL = ../arm/swap.c
|
||||
#CSWAPKERNEL = ../arm/zswap.c
|
||||
#ZSWAPKERNEL = ../arm/zswap.c
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
#SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
#DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
#CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
#ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
#
|
||||
#SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
#DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
#CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
#ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
#ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
|
||||
151
kernel/power/casum.c
Normal file
151
kernel/power/casum.c
Normal file
@@ -0,0 +1,151 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "casum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback for casum_kernel_16; it is only compiled when
 * HAVE_KERNEL_16 has not been defined by an included micro-kernel.
 *
 * Every pass of the loop reads 8 consecutive FLOATs starting at x1 and
 * advances the element counter by 4, stopping once the counter reaches n.
 * Absolute values are accumulated into four running sums (lane k receives
 * p[k] and p[k+4]); their total is stored in svec[0] and svec[1..3] are
 * set to zero.
 */
static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT acc0 = 0.0;
	FLOAT acc1 = 0.0;
	FLOAT acc2 = 0.0;
	FLOAT acc3 = 0.0;
	FLOAT *p = x1;
	BLASLONG done;

	for (done = 0; done < n; done += 4, p += 8)
	{
		/* first four values of the group */
		acc0 += ABS(p[0]);
		acc1 += ABS(p[1]);
		acc2 += ABS(p[2]);
		acc3 += ABS(p[3]);

		/* last four values of the group, folded into the same lanes */
		acc0 += ABS(p[4]);
		acc1 += ABS(p[5]);
		acc2 += ABS(p[6]);
		acc3 += ABS(p[7]);
	}

	/* the whole result is reported through svec[0] */
	svec[0] = acc0 + acc1 + acc2 + acc3;
	svec[1] = 0.0;
	svec[2] = 0.0;
	svec[3] = 0.0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Returns the sum of ABS(x[ip]) + ABS(x[ip+1]) over n element pairs of x.
 *
 * n      number of pairs to sum; the result is 0.0 when n <= 0
 * x      input array; a pair occupies two adjacent FLOATs
 * inc_x  distance between pairs, counted in pairs; the result is 0.0
 *        when inc_x <= 0
 *
 * With inc_x == 1 the largest multiple of 16 pairs is handed to
 * casum_kernel_16 and only the remainder is summed here.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	FLOAT partial[4] __attribute__ ((aligned (16)));
	BLASLONG done = 0;	/* pairs already accounted for */
	BLASLONG pos = 0;	/* index in x of the current pair */
	BLASLONG step;		/* FLOATs between consecutive pairs */

	if (n <= 0 || inc_x <= 0) return(total);

	if ( inc_x == 1 )
	{
		BLASLONG bulk = n & -16;

		if ( bulk > 0 )
		{
			casum_kernel_16(bulk, x, partial);
			total = partial[0] + partial[1] + partial[2] + partial[3];
			done = bulk;
			pos = 2 * bulk;
		}

		step = 2;
	}
	else
	{
		step = 2 * inc_x;
	}

	/* remainder of the contiguous case, or every pair of the strided case */
	for ( ; done < n; done++, pos += step )
	{
		total += ABS(x[pos]) + ABS(x[pos+1]);
	}

	return(total);
}
|
||||
|
||||
|
||||
177
kernel/power/casum_microk_power8.c
Normal file
177
kernel/power/casum_microk_power8.c
Normal file
@@ -0,0 +1,177 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));

/*
 * POWER8 VSX micro-kernel for casum.
 *
 * Loads 128 bytes from x per step (eight lxvw4x), takes the single-precision
 * absolute value of each vector (xvabssp) and accumulates into vs32..vs39.
 * The loop counter starts at n and is decremented by 16 per step; the loads
 * for the next step are issued while the current one is being summed.  The
 * eight accumulators are finally reduced into vs32, which is stored as four
 * words at svec (stxvw4x), so svec must have room for four FLOATs.
 *
 * The first 128-byte group is loaded unconditionally, so n is expected to be
 * a positive multiple of 16.
 *
 * Operand declarations differ from a plain input list on purpose:
 *  - the counter (%0) and the read pointer (%2) are changed by the asm and
 *    are therefore in/out operands;
 *  - %2 and the byte offsets %5..%11 appear in the RA slot of lxvw4x/dcbt or
 *    in addi, where register 0 would be read as the constant 0, so they use
 *    the "b" (base register) constraint;
 *  - every VSX register written by the template is listed as clobbered.
 */
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	BLASLONG pre = 384;

	__asm__ __volatile__
	(

	"dcbt %2 , %4 \n\t"

	"xxlxor 32,32,32 \n\t"
	"xxlxor 33,33,33 \n\t"
	"xxlxor 34,34,34 \n\t"
	"xxlxor 35,35,35 \n\t"
	"xxlxor 36,36,36 \n\t"
	"xxlxor 37,37,37 \n\t"
	"xxlxor 38,38,38 \n\t"
	"xxlxor 39,39,39 \n\t"

	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -16 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"dcbt %2 , %4 \n\t"

	"xvabssp 48, 40 \n\t"
	"xvabssp 49, 41 \n\t"
	"xvabssp 50, 42 \n\t"
	"xvabssp 51, 43 \n\t"

	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"

	"xvabssp 52, 44 \n\t"
	"xvabssp 53, 45 \n\t"

	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"

	"xvabssp 54, 46 \n\t"
	"xvabssp 55, 47 \n\t"

	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"

	"xvaddsp 32, 32, 48 \n\t"
	"xvaddsp 33, 33, 49 \n\t"

	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"xvaddsp 34, 34, 50 \n\t"
	"xvaddsp 35, 35, 51 \n\t"
	"addi %2, %2, 128 \n\t"
	"xvaddsp 36, 36, 52 \n\t"
	"xvaddsp 37, 37, 53 \n\t"
	"addic. %0 , %0 , -16 \n\t"
	"xvaddsp 38, 38, 54 \n\t"
	"xvaddsp 39, 39, 55 \n\t"

	"bgt 1b \n\t"

	"2: \n\t"

	/* drain: sum the group that was loaded last */
	"xvabssp 48, 40 \n\t"
	"xvabssp 49, 41 \n\t"
	"xvabssp 50, 42 \n\t"
	"xvabssp 51, 43 \n\t"
	"xvabssp 52, 44 \n\t"
	"xvabssp 53, 45 \n\t"
	"xvabssp 54, 46 \n\t"
	"xvabssp 55, 47 \n\t"

	"xvaddsp 32, 32, 48 \n\t"
	"xvaddsp 33, 33, 49 \n\t"
	"xvaddsp 34, 34, 50 \n\t"
	"xvaddsp 35, 35, 51 \n\t"
	"xvaddsp 36, 36, 52 \n\t"
	"xvaddsp 37, 37, 53 \n\t"
	"xvaddsp 38, 38, 54 \n\t"
	"xvaddsp 39, 39, 55 \n\t"

	/* reduce the eight accumulators into vs32 */
	"xvaddsp 32, 32, 33 \n\t"
	"xvaddsp 34, 34, 35 \n\t"
	"xvaddsp 36, 36, 37 \n\t"
	"xvaddsp 38, 38, 39 \n\t"

	"xvaddsp 32, 32, 34 \n\t"
	"xvaddsp 36, 36, 38 \n\t"

	"xvaddsp 32, 32, 36 \n\t"

	"stxvw4x 32, 0, %3 \n\t"

	:
	  "+r" (i),	// 0  loop counter, decremented by addic.
	  "+r" (n),	// 1  not referenced by the template; kept in this
			//    slot only so %2..%11 keep their numbers
	  "+b" (x1)	// 2  read pointer, advanced by addi
	:
	  "r" (svec),	// 3
	  "r" (pre),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
140
kernel/power/ccopy.c
Normal file
140
kernel/power/ccopy.c
Normal file
@@ -0,0 +1,140 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "ccopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback for ccopy_kernel_32; it is only compiled when
 * HAVE_KERNEL_32 has not been defined by an included micro-kernel.
 *
 * Copies FLOATs from x to y in groups of 8, advancing the element counter
 * by 4 per group until it reaches n.  All 8 values of a group are read
 * before any of them is written.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT group[8];
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG done;
	int k;

	for (done = 0; done < n; done += 4)
	{
		/* read the whole group first ... */
		for (k = 0; k < 8; k++)
			group[k] = src[k];

		/* ... then write it out */
		for (k = 0; k < 8; k++)
			dst[k] = group[k];

		src += 8;
		dst += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Copies n element pairs from x to y and always returns 0.
 *
 * n             number of pairs to copy; nothing is done when n <= 0
 * x, inc_x      source array and its stride, counted in pairs
 * y, inc_y      destination array and its stride, counted in pairs
 *
 * When both strides are 1, the largest multiple of 32 pairs is handed to
 * ccopy_kernel_32 and only the remainder is copied here.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	BLASLONG inc_x2;	/* FLOATs between consecutive source pairs */
	BLASLONG inc_y2;	/* FLOATs between consecutive destination pairs */

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1 ))
	{
		BLASLONG n1 = n & -32;

		if ( n1 > 0 )
		{
			ccopy_kernel_32(n1, x, y);
			i=n1;
			ix=n1*2;
			iy=n1*2;
		}

		inc_x2 = 2;
		inc_y2 = 2;
	}
	else
	{
		inc_x2 = 2 * inc_x;
		inc_y2 = 2 * inc_y;
	}

	while(i < n)
	{
		/* the source is always indexed with ix, the destination with iy */
		y[iy]   = x[ix] ;
		y[iy+1] = x[ix+1] ;
		ix += inc_x2 ;
		iy += inc_y2 ;
		i++ ;
	}

	return(0);
}
|
||||
|
||||
|
||||
174
kernel/power/ccopy_microk_power8.c
Normal file
174
kernel/power/ccopy_microk_power8.c
Normal file
@@ -0,0 +1,174 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

/*
 * POWER8 VSX micro-kernel for ccopy.
 *
 * Moves 256 bytes per step: two groups of eight 16-byte vectors are loaded
 * from x (lxvw4x into vs40..vs47 and vs50..vs57) and stored to y (stxvw4x).
 * The loop counter starts at n and is decremented by 32 per step; loads for
 * the next step are interleaved with the stores of the current one, and the
 * vectors still held in registers are stored after the loop.
 *
 * The first 256 bytes are loaded unconditionally, so n is expected to be a
 * positive multiple of 32.
 *
 * Operand declarations differ from a plain input list on purpose:
 *  - the counter (%0), the write pointer (%1) and the read pointer (%2) are
 *    changed by the asm and are therefore in/out operands;
 *  - %1, %2 and the byte offsets %5..%11 appear in addi or in the RA slot of
 *    lxvw4x/stxvw4x, where register 0 would be read as the constant 0, so
 *    they use the "b" (base register) constraint;
 *  - every VSX register written by the template is listed as clobbered.
 */
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	FLOAT *y1=y;
	BLASLONG pre = 384;
	BLASLONG alpha=0;

	__asm__ __volatile__
	(

	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"lxvw4x 50, 0, %2 \n\t"
	"lxvw4x 51, %5, %2 \n\t"
	"lxvw4x 52, %6, %2 \n\t"
	"lxvw4x 53, %7, %2 \n\t"
	"lxvw4x 54, %8, %2 \n\t"
	"lxvw4x 55, %9, %2 \n\t"
	"lxvw4x 56, %10, %2 \n\t"
	"lxvw4x 57, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -32 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"stxvw4x 40, 0, %1 \n\t"
	"stxvw4x 41, %5, %1 \n\t"
	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"stxvw4x 42, %6, %1 \n\t"
	"stxvw4x 43, %7, %1 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"stxvw4x 44, %8, %1 \n\t"
	"stxvw4x 45, %9, %1 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"stxvw4x 46, %10, %1 \n\t"
	"stxvw4x 47, %11, %1 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"

	"stxvw4x 50, 0, %1 \n\t"
	"stxvw4x 51, %5, %1 \n\t"
	"lxvw4x 50, 0, %2 \n\t"
	"lxvw4x 51, %5, %2 \n\t"
	"stxvw4x 52, %6, %1 \n\t"
	"stxvw4x 53, %7, %1 \n\t"
	"lxvw4x 52, %6, %2 \n\t"
	"lxvw4x 53, %7, %2 \n\t"
	"stxvw4x 54, %8, %1 \n\t"
	"stxvw4x 55, %9, %1 \n\t"
	"lxvw4x 54, %8, %2 \n\t"
	"lxvw4x 55, %9, %2 \n\t"
	"stxvw4x 56, %10, %1 \n\t"
	"stxvw4x 57, %11, %1 \n\t"
	"lxvw4x 56, %10, %2 \n\t"
	"lxvw4x 57, %11, %2 \n\t"

	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -32 \n\t"
	"bgt 1b \n\t"

	"2: \n\t"

	/* drain: store the 256 bytes that are still held in registers */
	"stxvw4x 40, 0, %1 \n\t"
	"stxvw4x 41, %5, %1 \n\t"
	"stxvw4x 42, %6, %1 \n\t"
	"stxvw4x 43, %7, %1 \n\t"
	"stxvw4x 44, %8, %1 \n\t"
	"stxvw4x 45, %9, %1 \n\t"
	"stxvw4x 46, %10, %1 \n\t"
	"stxvw4x 47, %11, %1 \n\t"

	"addi %1, %1, 128 \n\t"

	"stxvw4x 50, 0, %1 \n\t"
	"stxvw4x 51, %5, %1 \n\t"
	"stxvw4x 52, %6, %1 \n\t"
	"stxvw4x 53, %7, %1 \n\t"
	"stxvw4x 54, %8, %1 \n\t"
	"stxvw4x 55, %9, %1 \n\t"
	"stxvw4x 56, %10, %1 \n\t"
	"stxvw4x 57, %11, %1 \n\t"

	:
	  "+r" (i),	// 0  loop counter, decremented by addic.
	  "+b" (y1),	// 1  write pointer, advanced by addi
	  "+b" (x1)	// 2  read pointer, advanced by addi
	:
	  "r" (alpha),	// 3  not referenced by the template; kept so the
			//    following operands keep their numbers
	  "r" (pre),	// 4  not referenced by the template; kept for the
			//    same reason
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	: "cr0", "memory",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs50", "vs51", "vs52", "vs53", "vs54", "vs55", "vs56", "vs57"
	);

}
|
||||
|
||||
|
||||
407
kernel/power/cgemm_kernel_8x4_power8.S
Normal file
407
kernel/power/cgemm_kernel_8x4_power8.S
Normal file
@@ -0,0 +1,407 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
/* LOAD: load mnemonic selected by word size - lwz on 32-bit builds, ld on 64-bit builds. */
#ifndef __64BIT__
|
||||
#define LOAD lwz
|
||||
#else
|
||||
#define LOAD ld
|
||||
#endif
|
||||
|
||||
/*
 * Stack frame constants.  The prologue lowers SP by STACKSIZE four times,
 * so the frame is 4*STACKSIZE bytes.  ALPHA_R_SP / ALPHA_I_SP are the frame
 * slots that receive f1 / f2 (alpha real / imaginary) via stfs.  FZERO is
 * only referenced by a commented-out store in this file.
 */
#ifdef __64BIT__
|
||||
#define STACKSIZE 32000
|
||||
#define ALPHA_R_SP 296(SP)
|
||||
#define ALPHA_I_SP 304(SP)
|
||||
#define FZERO 312(SP)
|
||||
#else
|
||||
#define STACKSIZE 256
|
||||
#define ALPHA_R_SP 224(SP)
|
||||
#define ALPHA_I_SP 232(SP)
|
||||
#define FZERO 240(SP)
|
||||
#endif
|
||||
|
||||
/* Register aliases for the kernel arguments M, N and K. */
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
/*
 * Linux: register aliases for A, B, C, LDC and OFFSET.  On 64-bit builds the
 * prologue loads LDC (and OFFSET, when TRMMKERNEL is defined) from
 * FRAMESLOT offsets relative to FRAMEPOINTER instead of using an incoming
 * register value.
 */
#ifdef linux
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
#define LDC r9
|
||||
#define OFFSET r10
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
 * AIX / Apple targets: same set of aliases.  The prologue loads LDC (and,
 * for 32-bit DOUBLE builds, B and C as well) from FRAMESLOT offsets
 * relative to FRAMEPOINTER.
 */
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
#define A r10
|
||||
#define B r6
|
||||
#define C r7
|
||||
#define LDC r8
|
||||
#define OFFSET r9
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Literal 0, used as the RA operand of indexed loads (e.g. lxvw4x alpha_sr, o0, T1). */
#define o0 0
|
||||
|
||||
/*
 * alpha_dr / alpha_di: alpha real / imaginary part, loaded with lxsspx.
 * alpha_sr / alpha_si: 16-byte vectors loaded with lxvw4x from a buffer
 * holding three zero words followed by the corresponding scalar.
 */
#define alpha_dr vs28
|
||||
#define alpha_di vs29
|
||||
#define alpha_sr vs30
|
||||
#define alpha_si vs31
|
||||
|
||||
/* Copy of SP taken at entry, before the frame is allocated; base for reading stack-passed arguments. */
#define FRAMEPOINTER r12
|
||||
|
||||
/*
 * General-purpose working registers.
 *   BBUFFER       - 4096-byte-aligned address inside this frame
 *                   (SP + 512 + 4096, rounded down to a multiple of 4096).
 *   o4 .. o48     - constant byte offsets 4, 8, 12, 16, 32 and 48.
 *   T1, T2        - temporaries.
 *   L, BBO, I, J, AO, BO, CO - not referenced directly in this file;
 *                   presumably used by the included macro/logic files.
 */
#define BBUFFER r14
|
||||
#define L r15
|
||||
#define o12 r16
|
||||
#define o4 r17
|
||||
#define T2 r19
|
||||
#define BBO r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o16 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
/* PRE is loaded with 384 below; presumably a prefetch distance for the included code - TODO confirm. */
#define PRE r30
|
||||
#define T1 r31
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
/*
 * Function entry: remember the incoming SP, allocate a 4*STACKSIZE byte
 * frame, then save f14-f31 and r14-r31 at the bottom of the new frame.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Keep the entry SP so stack-passed arguments can still be addressed. */
mr FRAMEPOINTER, SP
|
||||
/* The frame is allocated in four steps of STACKSIZE bytes each. */
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
/* r0 = 0; its only consumer in this file is the commented-out FZERO store. */
li r0, 0
|
||||
|
||||
/* Save f14-f31 at offsets 0..136 of the new frame. */
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
/* Save r31 down to r14 starting at offset 144: 8-byte slots on 64-bit, 4-byte slots on 32-bit. */
#ifdef __64BIT__
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
stw r29, 152(SP)
|
||||
stw r28, 156(SP)
|
||||
stw r27, 160(SP)
|
||||
stw r26, 164(SP)
|
||||
stw r25, 168(SP)
|
||||
stw r24, 172(SP)
|
||||
stw r23, 176(SP)
|
||||
stw r22, 180(SP)
|
||||
stw r21, 184(SP)
|
||||
stw r20, 188(SP)
|
||||
stw r19, 192(SP)
|
||||
stw r18, 196(SP)
|
||||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
#endif
|
||||
|
||||
/* Spill alpha (f1 = real part, f2 = imaginary part) to the frame as single precision. */
stfs f1, ALPHA_R_SP
|
||||
stfs f2, ALPHA_I_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
/* Load LDC from the caller's frame, addressed through the entry SP saved in FRAMEPOINTER. */
#ifdef linux
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* AIX / Apple: LDC always comes from the frame; 32-bit DOUBLE builds also load B and C from it. */
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
 * TRMM builds only: load OFFSET from the caller's frame and, when LEFT is
 * not defined, set KK = -OFFSET.  NOTE(review): KK has no #define in this
 * file; presumably this path is only meaningful for the TRMM kernel source.
 */
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
neg KK, OFFSET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Pull in the macro definitions used by the included logic file. */
#include "cgemm_macros_8x4_power8.S"
|
||||
|
||||
/*
 * Nothing to do if M, N or K is <= 0 (signed 32-bit word compare): branch
 * to L999_H1, which is not defined in this file (presumably in the
 * included logic file).
 */
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble L999_H1
|
||||
|
||||
/* Shift LDC left by ZBASE_SHIFT (presumably element count -> byte stride; confirm against common.h). */
slwi LDC, LDC, ZBASE_SHIFT
|
||||
/* Load the constant registers: PRE = 384 and the byte offsets 4, 8, 12, 16, 32, 48. */
li PRE, 384
|
||||
li o4 , 4
|
||||
li o8 , 8
|
||||
li o12 , 12
|
||||
li o16 , 16
|
||||
li o32 , 32
|
||||
li o48 , 48
|
||||
|
||||
/* BBUFFER = (SP + 512 + 4096) & -4096, i.e. a 4096-byte-aligned address inside the frame. */
addi BBUFFER, SP, 512+4096
|
||||
li T1, -4096
|
||||
and BBUFFER, BBUFFER, T1
|
||||
|
||||
|
||||
/* T1 = address of the ALPHA_R_SP slot (296 / 224 match the ALPHA_R_SP macro). */
#ifdef __64BIT__
|
||||
addi T1 , SP, 296
|
||||
#else
|
||||
addi T1 , SP, 224
|
||||
#endif
|
||||
|
||||
/* Round-trip vs1 / vs2 through memory as single precision into alpha_dr / alpha_di. */
stxsspx vs1, 0, T1
|
||||
lxsspx alpha_dr, 0, T1
|
||||
stxsspx vs2, o8 , T1
|
||||
lxsspx alpha_di, o8, T1
|
||||
/* Build a 16-byte buffer at SP+360: three zero words, then alpha real at byte offset 12. */
addi T1, SP, 360
|
||||
li T2, 0
|
||||
|
||||
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_dr, o12, T1
|
||||
/* Load that buffer as a 4-word vector into alpha_sr. */
lxvw4x alpha_sr, o0 , T1
|
||||
/* Same construction 16 bytes further on (SP+376) for the imaginary part -> alpha_si. */
addi T1, T1, 16
|
||||
|
||||
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_di, o12, T1
|
||||
lxvw4x alpha_si, o0 , T1
|
||||
|
||||
/* Align the start of the included code to a 2^5 = 32-byte boundary. */
.align 5
|
||||
|
||||
/* The GEMM loop nest itself lives in the included logic file. */
#include "cgemm_logic_8x4_power8.S"
|
||||
|
||||
/*
 * Common exit: return 0 in r3, restore the registers saved by the prologue
 * from the same frame offsets, release the 4*STACKSIZE frame and return.
 */
L999:
|
||||
/* r3 = 0 (addi with RA = 0 loads the immediate). */
addi r3, 0, 0
|
||||
|
||||
/* Restore f14-f31 from offsets 0..136. */
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
/* Restore r31 down to r14 from offset 144 onwards (8-byte slots on 64-bit, 4-byte on 32-bit). */
#ifdef __64BIT__
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
lwz r29, 152(SP)
|
||||
lwz r28, 156(SP)
|
||||
lwz r27, 160(SP)
|
||||
lwz r26, 164(SP)
|
||||
lwz r25, 168(SP)
|
||||
lwz r24, 172(SP)
|
||||
lwz r23, 176(SP)
|
||||
lwz r22, 180(SP)
|
||||
lwz r21, 184(SP)
|
||||
lwz r20, 188(SP)
|
||||
lwz r19, 192(SP)
|
||||
lwz r18, 196(SP)
|
||||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
#endif
|
||||
|
||||
/* Release the frame in the same four STACKSIZE steps used to allocate it. */
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
1459
kernel/power/cgemm_logic_8x4_power8.S
Normal file
1459
kernel/power/cgemm_logic_8x4_power8.S
Normal file
File diff suppressed because it is too large
Load Diff
6355
kernel/power/cgemm_macros_8x4_power8.S
Normal file
6355
kernel/power/cgemm_macros_8x4_power8.S
Normal file
File diff suppressed because it is too large
Load Diff
175
kernel/power/cswap.c
Normal file
175
kernel/power/cswap.c
Normal file
@@ -0,0 +1,175 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "cswap_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback used when no optimized micro-kernel defined
 * HAVE_KERNEL_32.  Exchanges the contents of two unit-stride complex
 * vectors, working in groups of 4 complex elements (8 FLOAT values).
 *
 *   n - number of complex elements to exchange; one group of 4 is processed
 *       for every step of 4 below n
 *   x - first vector (interleaved real/imaginary FLOATs)
 *   y - second vector (same layout)
 *
 * Within each group all 8 values of x and all 8 values of y are read before
 * anything is written; y is then written before x.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT from_x[8];
	FLOAT from_y[8];
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG done;
	BLASLONG k;

	for ( done = 0; done < n; done += 4 )
	{
		/* read phase: x first, then y */
		for ( k = 0; k < 8; k++ )
			from_x[k] = xp[k];
		for ( k = 0; k < 8; k++ )
			from_y[k] = yp[k];

		/* write phase: y first, then x */
		for ( k = 0; k < 8; k++ )
			yp[k] = from_x[k];
		for ( k = 0; k < 8; k++ )
			xp[k] = from_y[k];

		xp += 8;
		yp += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Complex swap entry point: exchanges n complex elements between x and y.
 *
 *   n            - number of complex elements; n <= 0 is a no-op
 *   x, inc_x     - first vector and its stride, counted in complex elements
 *   y, inc_y     - second vector and its stride, counted in complex elements
 *   dummy*       - unused by this function
 *
 * When both strides are 1, the largest multiple of 32 elements is handed to
 * cswap_kernel_32() and only the remainder is swapped by the scalar loop.
 * Any other stride combination is handled entirely by the scalar loop.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG done = 0;		/* complex elements already swapped */
	BLASLONG px = 0, py = 0;	/* FLOAT indices into x and y */
	BLASLONG step_x, step_y;	/* FLOAT distance between consecutive elements */
	FLOAT re, im;

	if ( n <= 0 ) return(0);

	/* each complex element occupies two FLOATs */
	step_x = 2 * inc_x;
	step_y = 2 * inc_y;

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		BLASLONG bulk = n & -32;

		if ( bulk > 0 )
		{
			cswap_kernel_32(bulk, x, y);
			done = bulk;
			px = 2 * bulk;
			py = 2 * bulk;
		}
	}

	for ( ; done < n; done++ )
	{
		re = x[px];
		im = x[px+1];
		x[px]   = y[py];
		x[px+1] = y[py+1];
		y[py]   = re;
		y[py+1] = im;

		px += step_x;
		py += step_y;
	}

	return(0);
}
|
||||
|
||||
|
||||
180
kernel/power/cswap_microk_power8.c
Normal file
180
kernel/power/cswap_microk_power8.c
Normal file
@@ -0,0 +1,180 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX micro-kernel: exchanges n single-precision complex elements
 * between the unit-stride vectors x and y.
 *
 *   n - number of complex elements; must be a positive multiple of 32.
 *       Each loop pass moves 256 bytes (32 complex elements) per vector and
 *       the counter is only tested after the pass.
 *   x - first vector
 *   y - second vector
 *
 * Each pass loads 256 bytes of x into vs32-vs47 and 256 bytes of y into
 * vs48-vs63, then stores the x data to y and the y data to x.
 *
 * Inline-asm contract (this is what the compiler relies on):
 *   - the counter and the four cursors are advanced by the asm, so they are
 *     read-write ("+") operands rather than plain inputs;
 *   - cursors and offsets use the "b" constraint (any GPR except r0),
 *     because r0 in the RA slot of addi / lxvw4x / stxvw4x is read as the
 *     constant 0;
 *   - every VSX register written is listed as clobbered.  vs52-vs63 overlay
 *     the non-volatile vector registers v20-v31, so the compiler must be
 *     told in order to preserve them for the caller.
 */
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1 = x;		/* load cursor for x */
	FLOAT *y1 = y;		/* load cursor for y */
	FLOAT *x2 = x;		/* store cursor for x */
	FLOAT *y2 = y;		/* store cursor for y */

	/* The loop body runs before the counter is tested; never enter it for an empty request. */
	if ( n <= 0 ) return;

	__asm__ __volatile__
	(
	".align 5			\n\t"
	"1:				\n\t"

	/* load 256 bytes of x into vs32-vs47 */
	"lxvw4x		32, 0, %2	\n\t"
	"lxvw4x		33, %5, %2	\n\t"
	"lxvw4x		34, %6, %2	\n\t"
	"lxvw4x		35, %7, %2	\n\t"
	"lxvw4x		36, %8, %2	\n\t"
	"lxvw4x		37, %9, %2	\n\t"
	"lxvw4x		38, %10, %2	\n\t"
	"lxvw4x		39, %11, %2	\n\t"

	"addi		%2, %2, 128	\n\t"

	"lxvw4x		40, 0, %2	\n\t"
	"lxvw4x		41, %5, %2	\n\t"
	"lxvw4x		42, %6, %2	\n\t"
	"lxvw4x		43, %7, %2	\n\t"
	"lxvw4x		44, %8, %2	\n\t"
	"lxvw4x		45, %9, %2	\n\t"
	"lxvw4x		46, %10, %2	\n\t"
	"lxvw4x		47, %11, %2	\n\t"

	"addi		%2, %2, 128	\n\t"

	/* load 256 bytes of y into vs48-vs63 */
	"lxvw4x		48, 0, %1	\n\t"
	"lxvw4x		49, %5, %1	\n\t"
	"lxvw4x		50, %6, %1	\n\t"
	"lxvw4x		51, %7, %1	\n\t"
	"lxvw4x		52, %8, %1	\n\t"
	"lxvw4x		53, %9, %1	\n\t"
	"lxvw4x		54, %10, %1	\n\t"
	"lxvw4x		55, %11, %1	\n\t"

	"addi		%1, %1, 128	\n\t"

	"lxvw4x		56, 0, %1	\n\t"
	"lxvw4x		57, %5, %1	\n\t"
	"lxvw4x		58, %6, %1	\n\t"
	"lxvw4x		59, %7, %1	\n\t"
	"lxvw4x		60, %8, %1	\n\t"
	"lxvw4x		61, %9, %1	\n\t"
	"lxvw4x		62, %10, %1	\n\t"
	"lxvw4x		63, %11, %1	\n\t"

	"addi		%1, %1, 128	\n\t"

	/* store the former x data to y */
	"stxvw4x	32, 0, %3	\n\t"
	"stxvw4x	33, %5, %3	\n\t"
	"stxvw4x	34, %6, %3	\n\t"
	"stxvw4x	35, %7, %3	\n\t"
	"stxvw4x	36, %8, %3	\n\t"
	"stxvw4x	37, %9, %3	\n\t"
	"stxvw4x	38, %10, %3	\n\t"
	"stxvw4x	39, %11, %3	\n\t"

	"addi		%3, %3, 128	\n\t"

	"stxvw4x	40, 0, %3	\n\t"
	"stxvw4x	41, %5, %3	\n\t"
	"stxvw4x	42, %6, %3	\n\t"
	"stxvw4x	43, %7, %3	\n\t"
	"stxvw4x	44, %8, %3	\n\t"
	"stxvw4x	45, %9, %3	\n\t"
	"stxvw4x	46, %10, %3	\n\t"
	"stxvw4x	47, %11, %3	\n\t"

	"addi		%3, %3, 128	\n\t"

	/* store the former y data to x */
	"stxvw4x	48, 0, %4	\n\t"
	"stxvw4x	49, %5, %4	\n\t"
	"stxvw4x	50, %6, %4	\n\t"
	"stxvw4x	51, %7, %4	\n\t"
	"stxvw4x	52, %8, %4	\n\t"
	"stxvw4x	53, %9, %4	\n\t"
	"stxvw4x	54, %10, %4	\n\t"
	"stxvw4x	55, %11, %4	\n\t"

	"addi		%4, %4, 128	\n\t"

	"stxvw4x	56, 0, %4	\n\t"
	"stxvw4x	57, %5, %4	\n\t"
	"stxvw4x	58, %6, %4	\n\t"
	"stxvw4x	59, %7, %4	\n\t"
	"stxvw4x	60, %8, %4	\n\t"
	"stxvw4x	61, %9, %4	\n\t"
	"stxvw4x	62, %10, %4	\n\t"
	"stxvw4x	63, %11, %4	\n\t"

	"addi		%4, %4, 128	\n\t"

	/* 32 complex elements done; loop while the remaining count is > 0 */
	"addic.		%0 , %0 , -32	\n\t"
	"bgt		1b		\n\t"

	:
	  "+r" (i),	// 0  remaining element count
	  "+b" (y1),	// 1  y load cursor
	  "+b" (x1),	// 2  x load cursor
	  "+b" (y2),	// 3  y store cursor
	  "+b" (x2)	// 4  x store cursor
	:
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	:
	  "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63"
	);

}
|
||||
|
||||
|
||||
399
kernel/power/ctrmm_kernel_8x4_power8.S
Normal file
399
kernel/power/ctrmm_kernel_8x4_power8.S
Normal file
@@ -0,0 +1,399 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
#ifndef __64BIT__
|
||||
#define LOAD lwz
|
||||
#else
|
||||
#define LOAD ld
|
||||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 400
|
||||
#define ALPHA_R_SP 304(SP)
|
||||
#define ALPHA_I_SP 312(SP)
|
||||
#else
|
||||
#define STACKSIZE 256
|
||||
#define ALPHA_R_SP 224(SP)
|
||||
#define ALPHA_I_SP 232(SP)
|
||||
#define FZERO 240(SP)
|
||||
#endif
|
||||
|
||||
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
#define LDC r9
|
||||
#define OFFSET r10
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
#define A r10
|
||||
#define B r6
|
||||
#define C r7
|
||||
#define LDC r8
|
||||
#define OFFSET r9
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define o0 0
|
||||
|
||||
#define alpha_dr vs28
|
||||
#define alpha_di vs29
|
||||
#define alpha_sr vs30
|
||||
#define alpha_si vs31
|
||||
|
||||
#define o12 r12
|
||||
#define KKK r13
|
||||
#define K1 r14
|
||||
#define L r15
|
||||
#define o16 r16
|
||||
#define NOTUSED r17
|
||||
#define T2 r19
|
||||
#define KK r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o4 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T1 r31
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
#ifdef __64BIT__
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
std r13, 288(SP)
|
||||
std r12, 296(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
stw r29, 152(SP)
|
||||
stw r28, 156(SP)
|
||||
stw r27, 160(SP)
|
||||
stw r26, 164(SP)
|
||||
stw r25, 168(SP)
|
||||
stw r24, 172(SP)
|
||||
stw r23, 176(SP)
|
||||
stw r22, 180(SP)
|
||||
stw r21, 184(SP)
|
||||
stw r20, 188(SP)
|
||||
stw r19, 192(SP)
|
||||
stw r18, 196(SP)
|
||||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
stw r13, 216(SP)
|
||||
#endif
|
||||
|
||||
stfs f1, ALPHA_R_SP
|
||||
stfs f2, ALPHA_I_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
#ifdef linux
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
|
||||
#else
|
||||
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
neg KK, OFFSET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "ctrmm_macros_8x4_power8.S"
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble L999_H1
|
||||
|
||||
slwi LDC, LDC, ZBASE_SHIFT
|
||||
li PRE, 384
|
||||
li o4 , 4
|
||||
li o8 , 8
|
||||
li o12 , 12
|
||||
li o16 , 16
|
||||
li o32 , 32
|
||||
li o48 , 48
|
||||
|
||||
|
||||
#ifdef __64BIT__
|
||||
addi T1, SP, 304
|
||||
#else
|
||||
addi T1, SP, 224
|
||||
#endif
|
||||
|
||||
lxsspx alpha_dr, 0, T1
|
||||
lxsspx alpha_di, o8, T1
|
||||
addi T1, SP, 360
|
||||
li T2, 0
|
||||
|
||||
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_dr, o12, T1
|
||||
lxvw4x alpha_sr, o0 , T1
|
||||
addi T1, T1, 16
|
||||
|
||||
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_di, o12, T1
|
||||
lxvw4x alpha_si, o0 , T1
|
||||
|
||||
.align 5
|
||||
|
||||
#include "ctrmm_logic_8x4_power8.S"
|
||||
|
||||
L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
#ifdef __64BIT__
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
ld r13, 288(SP)
|
||||
ld r12, 296(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
lwz r29, 152(SP)
|
||||
lwz r28, 156(SP)
|
||||
lwz r27, 160(SP)
|
||||
lwz r26, 164(SP)
|
||||
lwz r25, 168(SP)
|
||||
lwz r24, 172(SP)
|
||||
lwz r23, 176(SP)
|
||||
lwz r22, 180(SP)
|
||||
lwz r21, 184(SP)
|
||||
lwz r20, 188(SP)
|
||||
lwz r19, 192(SP)
|
||||
lwz r18, 196(SP)
|
||||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
lwz r13, 216(SP)
|
||||
#endif
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
1769
kernel/power/ctrmm_logic_8x4_power8.S
Normal file
1769
kernel/power/ctrmm_logic_8x4_power8.S
Normal file
File diff suppressed because it is too large
Load Diff
6794
kernel/power/ctrmm_macros_8x4_power8.S
Normal file
6794
kernel/power/ctrmm_macros_8x4_power8.S
Normal file
File diff suppressed because it is too large
Load Diff
144
kernel/power/dasum.c
Normal file
144
kernel/power/dasum.c
Normal file
@@ -0,0 +1,144 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dasum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback used when no optimized micro-kernel has defined
 * HAVE_KERNEL_16.
 *
 * Accumulates the absolute values of x1[0 .. n-1] in steps of eight
 * elements, spreading them over four independent partial sums.  The loop
 * always consumes a full group of eight, so n is expected to be a multiple
 * of eight (the caller in this file passes a multiple of sixteen).
 *
 * The total is written to svec[0]; svec[1] is set to zero so that the
 * caller can add both slots unconditionally.
 */
static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT acc0 = 0.0, acc1 = 0.0, acc2 = 0.0, acc3 = 0.0;
	FLOAT *p = x1;
	BLASLONG done;

	for (done = 0; done < n; done += 8, p += 8)
	{
		/* first half of the group: one element per accumulator */
		acc0 += ABS(p[0]);
		acc1 += ABS(p[1]);
		acc2 += ABS(p[2]);
		acc3 += ABS(p[3]);

		/* second half of the group, same accumulator order */
		acc0 += ABS(p[4]);
		acc1 += ABS(p[5]);
		acc2 += ABS(p[6]);
		acc3 += ABS(p[7]);
	}

	svec[0] = acc0 + acc1 + acc2 + acc3;
	svec[1] = 0.0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Sum of absolute values of a vector.
 *
 *   n      number of elements to sum
 *   x      first element of the vector
 *   inc_x  distance (in elements) between consecutive entries
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple of sixteen elements is handed to dasum_kernel_16() and the
 * remainder is summed with a scalar loop; any other stride is summed with
 * a scalar loop only.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	FLOAT svec[2] __attribute__ ((aligned (16)));
	BLASLONG pos = 0;
	BLASLONG blocked;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0) return(total);

	if ( inc_x != 1 )
	{
		/* strided access: walk the index directly in steps of inc_x */
		limit = n * inc_x;
		for (pos = 0; pos < limit; pos += inc_x)
			total += ABS(x[pos]);

		return(total);
	}

	/* contiguous access: bulk part through the kernel ... */
	blocked = n & -16;
	if ( blocked > 0 )
	{
		dasum_kernel_16(blocked, x, svec);
		total = svec[0] + svec[1];
		pos = blocked;
	}

	/* ... and the (at most fifteen) leftover elements one by one */
	for (; pos < n; pos++)
		total += ABS(x[pos]);

	return(total);
}
|
||||
|
||||
|
||||
177
kernel/power/dasum_microk_power8.c
Normal file
177
kernel/power/dasum_microk_power8.c
Normal file
@@ -0,0 +1,177 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));

/*
 * POWER8 VSX micro-kernel: sum of absolute values of n doubles.
 *
 *   n     number of elements; sixteen elements are loaded before the first
 *         count check, so n must be a positive multiple of sixteen
 *   x     input vector (contiguous); accessed with lxvd2x, i.e. as doubles
 *   svec  receives two partial sums (one 16-byte store); the caller adds
 *         svec[0] and svec[1] to obtain the result
 *
 * Inline-asm contract:
 *   - the loop counter and the x pointer are updated by the asm and are
 *     therefore read-write ("+") operands rather than inputs;
 *   - every operand that appears in an RA slot (addi, dcbt, the index of
 *     lxvd2x) uses the "b" constraint, because r0 in that slot is read as
 *     the constant 0 rather than as a register;
 *   - all VSX registers written by the code are listed as clobbers
 *     (vs52..vs55 overlay the callee-saved v20..v23).
 */
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	BLASLONG pre = 384;

	__asm__  __volatile__
	(

	"dcbt		%[x] , %[pre]			    \n\t"

	/* vs32..vs39: eight independent accumulators, cleared to zero */
	"xxlxor		32,32,32		    \n\t"
	"xxlxor		33,33,33		    \n\t"
	"xxlxor		34,34,34		    \n\t"
	"xxlxor		35,35,35		    \n\t"
	"xxlxor		36,36,36		    \n\t"
	"xxlxor		37,37,37		    \n\t"
	"xxlxor		38,38,38		    \n\t"
	"xxlxor		39,39,39		    \n\t"

	/* preload the first sixteen elements into vs40..vs47 */
	"lxvd2x		40, 0, %[x]			    \n\t"
	"lxvd2x		41, %[o16], %[x]		    \n\t"
	"lxvd2x		42, %[o32], %[x]		    \n\t"
	"lxvd2x		43, %[o48], %[x]		    \n\t"
	"lxvd2x		44, %[o64], %[x]		    \n\t"
	"lxvd2x		45, %[o80], %[x]		    \n\t"
	"lxvd2x		46, %[o96], %[x]		    \n\t"
	"lxvd2x		47, %[o112], %[x]		    \n\t"

	"addi		%[x], %[x], 128			    \n\t"

	"addic.		%[i] , %[i] , -16		    \n\t"
	"ble		2f			    \n\t"

	".align 5				    \n\t"
	"1:					    \n\t"

	"dcbt		%[x] , %[pre]			    \n\t"

	/* |x| of the previous group, interleaved with loads of the next one */
	"xvabsdp	48, 40			    \n\t"
	"xvabsdp	49, 41			    \n\t"
	"xvabsdp	50, 42			    \n\t"
	"xvabsdp	51, 43			    \n\t"

	"lxvd2x		40, 0, %[x]			    \n\t"
	"lxvd2x		41, %[o16], %[x]		    \n\t"

	"xvabsdp	52, 44			    \n\t"
	"xvabsdp	53, 45			    \n\t"

	"lxvd2x		42, %[o32], %[x]		    \n\t"
	"lxvd2x		43, %[o48], %[x]		    \n\t"

	"xvabsdp	54, 46			    \n\t"
	"xvabsdp	55, 47			    \n\t"

	"lxvd2x		44, %[o64], %[x]		    \n\t"
	"lxvd2x		45, %[o80], %[x]		    \n\t"

	"xvadddp	32, 32, 48		    \n\t"
	"xvadddp	33, 33, 49		    \n\t"

	"lxvd2x		46, %[o96], %[x]		    \n\t"
	"lxvd2x		47, %[o112], %[x]		    \n\t"

	"xvadddp	34, 34, 50		    \n\t"
	"xvadddp	35, 35, 51		    \n\t"
	"addi		%[x], %[x], 128			    \n\t"
	"xvadddp	36, 36, 52		    \n\t"
	"xvadddp	37, 37, 53		    \n\t"
	"addic.		%[i] , %[i] , -16		    \n\t"
	"xvadddp	38, 38, 54		    \n\t"
	"xvadddp	39, 39, 55		    \n\t"

	"bgt		1b			    \n\t"

	"2:					    \n\t"

	/* drain: process the last group that is still held in vs40..vs47 */
	"xvabsdp	48, 40			    \n\t"
	"xvabsdp	49, 41			    \n\t"
	"xvabsdp	50, 42			    \n\t"
	"xvabsdp	51, 43			    \n\t"
	"xvabsdp	52, 44			    \n\t"
	"xvabsdp	53, 45			    \n\t"
	"xvabsdp	54, 46			    \n\t"
	"xvabsdp	55, 47			    \n\t"

	"xvadddp	32, 32, 48		    \n\t"
	"xvadddp	33, 33, 49		    \n\t"
	"xvadddp	34, 34, 50		    \n\t"
	"xvadddp	35, 35, 51		    \n\t"
	"xvadddp	36, 36, 52		    \n\t"
	"xvadddp	37, 37, 53		    \n\t"
	"xvadddp	38, 38, 54		    \n\t"
	"xvadddp	39, 39, 55		    \n\t"

	/* tree-reduce the eight accumulators into vs32 */
	"xvadddp	32, 32, 33		     \n\t"
	"xvadddp	34, 34, 35		     \n\t"
	"xvadddp	36, 36, 37		     \n\t"
	"xvadddp	38, 38, 39		     \n\t"

	"xvadddp	32, 32, 34		     \n\t"
	"xvadddp	36, 36, 38		     \n\t"

	"xvadddp	32, 32, 36		     \n\t"

	/* two partial sums -> svec[0], svec[1] */
	"stxvd2x	32, 0, %[svec]		     \n\t"

	:
	  [i]    "+r" (i),	// remaining element count, decremented in the asm
	  [x]    "+b" (x1)	// read pointer, advanced in the asm
	:
	  [svec] "r" (svec),
	  [pre]  "r" (pre),
	  [o16]  "b" (o16),
	  [o32]  "b" (o32),
	  [o48]  "b" (o48),
	  [o64]  "b" (o64),
	  [o80]  "b" (o80),
	  [o96]  "b" (o96),
	  [o112] "b" (o112)
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
136
kernel/power/daxpy.c
Normal file
136
kernel/power/daxpy.c
Normal file
@@ -0,0 +1,136 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "daxpy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
/*
 * Portable fallback used when no optimized micro-kernel has defined
 * HAVE_KERNEL_8.
 *
 * Computes y[k] += alpha[0] * x[k] for k in [0, n), eight elements per
 * outer step.  A full group of eight is always processed, so n is expected
 * to be a multiple of eight.
 */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT scale = *alpha;
	BLASLONG base;
	BLASLONG lane;

	for (base = 0; base < n; base += 8)
	{
		/* elements are updated in ascending order within the group */
		for (lane = 0; lane < 8; lane++)
		{
			y[base + lane] += scale * x[base + lane];
		}
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * y := y + da * x
 *
 *   n             number of elements
 *   da            scalar multiplier
 *   x, inc_x      source vector and its stride (in elements)
 *   y, inc_y      destination vector and its stride (in elements)
 *   dummy0, dummy1, dummy, dummy2
 *                 not used by this implementation
 *
 * Always returns 0.  When both strides are one, the largest multiple of
 * sixteen elements goes through daxpy_kernel_8() and the rest through a
 * scalar loop.  Otherwise a four-way unrolled strided loop is used,
 * followed by a scalar loop for the remainder.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	/* the kernel takes the scalar through a pointer; keep four copies */
	FLOAT a2[4] = { da, da, da, da };
	BLASLONG k;
	BLASLONG head;
	BLASLONG ix = 0;
	BLASLONG iy = 0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		head = n & -16;

		if ( head )
			daxpy_kernel_8(head, x, y , a2 );

		for (k = head; k < n; k++)
			y[k] += da * x[k] ;

		return(0);
	}

	head = n & -4;

	for (k = 0; k < head; k += 4)
	{
		/* all four products are formed before any element of y is written */
		FLOAT m1 = da * x[ix] ;
		FLOAT m2 = da * x[ix+inc_x] ;
		FLOAT m3 = da * x[ix+2*inc_x] ;
		FLOAT m4 = da * x[ix+3*inc_x] ;

		y[iy]         += m1 ;
		y[iy+inc_y]   += m2 ;
		y[iy+2*inc_y] += m3 ;
		y[iy+3*inc_y] += m4 ;

		ix += inc_x*4 ;
		iy += inc_y*4 ;
	}

	for (; k < n; k++)
	{
		y[iy] += da * x[ix] ;
		ix += inc_x ;
		iy += inc_y ;
	}

	return(0);
}
|
||||
|
||||
|
||||
201
kernel/power/daxpy_microk_power8.c
Normal file
201
kernel/power/daxpy_microk_power8.c
Normal file
@@ -0,0 +1,201 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX micro-kernel: y[k] += alpha[0] * x[k] for k in [0, n).
 *
 *   n      number of elements; sixteen elements of x and y are loaded
 *          before the first count check, so n must be a positive multiple
 *          of sixteen
 *   x      source vector (contiguous doubles)
 *   y      destination vector (contiguous doubles), updated in place
 *   alpha  pointer to the scalar; element 0 is read, matching the portable
 *          fallback kernel, which reads *alpha
 *
 * Loads of y run one sixteen-element group ahead of the stores, so two
 * cursors into y are kept: y_load and y_store.
 *
 * Inline-asm contract:
 *   - the counter and all three cursors are modified by the asm and are
 *     therefore read-write ("+") operands; being separate outputs they are
 *     guaranteed distinct registers even though y_load and y_store start
 *     with the same value;
 *   - operands used in an RA slot (addi, dcbt, the index of lxvd2x /
 *     stxvd2x) use the "b" constraint, because r0 in that slot is read as
 *     the constant 0;
 *   - all VSX registers written by the code are listed as clobbers
 *     (vs52..vs55 overlay the callee-saved v20..v23).
 */
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	FLOAT *x1=x;
	FLOAT *y_load=y;
	FLOAT *y_store=y;
	BLASLONG pre = 384;

	__asm__  __volatile__
	(

	/* vs32 = { alpha, alpha } */
	"lxsdx		33, 0, %[alpha]			    \n\t"
	"xxspltd	32, 33, 0		    \n\t"

	"dcbt		%[x], %[pre]			    \n\t"
	"dcbt		%[yl], %[pre]			    \n\t"

	/* preload x[0..7] -> vs40..vs43 and y[0..7] -> vs48..vs51 */
	"lxvd2x		40, 0, %[x]			    \n\t"
	"lxvd2x		41, %[o16], %[x]		    \n\t"
	"lxvd2x		42, %[o32], %[x]		    \n\t"
	"lxvd2x		43, %[o48], %[x]		    \n\t"

	"lxvd2x		48, 0, %[yl]			    \n\t"
	"lxvd2x		49, %[o16], %[yl]		    \n\t"
	"lxvd2x		50, %[o32], %[yl]		    \n\t"
	"lxvd2x		51, %[o48], %[yl]		    \n\t"

	"addi		%[x], %[x], 64			    \n\t"
	"addi		%[yl], %[yl], 64		    \n\t"

	/* preload x[8..15] -> vs44..vs47 and y[8..15] -> vs52..vs55 */
	"lxvd2x		44, 0, %[x]			    \n\t"
	"lxvd2x		45, %[o16], %[x]		    \n\t"
	"lxvd2x		46, %[o32], %[x]		    \n\t"
	"lxvd2x		47, %[o48], %[x]		    \n\t"

	"lxvd2x		52, 0, %[yl]			    \n\t"
	"lxvd2x		53, %[o16], %[yl]		    \n\t"
	"lxvd2x		54, %[o32], %[yl]		    \n\t"
	"lxvd2x		55, %[o48], %[yl]		    \n\t"

	"addi		%[x], %[x], 64			    \n\t"
	"addi		%[yl], %[yl], 64		    \n\t"

	"addic.		%[i] , %[i] , -16		    \n\t"
	"ble		2f			    \n\t"

	".align 5				    \n\t"
	"1:					    \n\t"

	"dcbt		%[x], %[pre]			    \n\t"
	"dcbt		%[yl], %[pre]			    \n\t"

	/* first half of the group: multiply-add, store, reload */
	"xvmaddadp	48, 40, 32		    \n\t"
	"xvmaddadp	49, 41, 32		    \n\t"

	"lxvd2x		40, 0, %[x]			    \n\t"
	"lxvd2x		41, %[o16], %[x]		    \n\t"

	"stxvd2x	48,  0, %[ys]		    \n\t"
	"stxvd2x	49, %[o16], %[ys]		    \n\t"

	"xvmaddadp	50, 42, 32		    \n\t"
	"xvmaddadp	51, 43, 32		    \n\t"

	"lxvd2x		42, %[o32], %[x]		    \n\t"
	"lxvd2x		43, %[o48], %[x]		    \n\t"

	"stxvd2x	50, %[o32], %[ys]		    \n\t"
	"stxvd2x	51, %[o48], %[ys]		    \n\t"

	"lxvd2x		48, 0, %[yl]			    \n\t"
	"lxvd2x		49, %[o16], %[yl]		    \n\t"
	"lxvd2x		50, %[o32], %[yl]		    \n\t"
	"lxvd2x		51, %[o48], %[yl]		    \n\t"

	"addi		%[x], %[x], 64			    \n\t"
	"addi		%[ys], %[ys], 64		    \n\t"

	/* second half of the group */
	"xvmaddadp	52, 44, 32		    \n\t"
	"addi		%[yl], %[yl], 64		    \n\t"
	"xvmaddadp	53, 45, 32		    \n\t"

	"lxvd2x		44, 0, %[x]			    \n\t"
	"lxvd2x		45, %[o16], %[x]		    \n\t"

	"stxvd2x	52,  0, %[ys]		    \n\t"
	"stxvd2x	53, %[o16], %[ys]		    \n\t"

	"xvmaddadp	54, 46, 32		    \n\t"
	"xvmaddadp	55, 47, 32		    \n\t"

	"lxvd2x		46, %[o32], %[x]		    \n\t"
	"lxvd2x		47, %[o48], %[x]		    \n\t"

	"stxvd2x	54, %[o32], %[ys]		    \n\t"
	"stxvd2x	55, %[o48], %[ys]		    \n\t"

	"addi		%[x], %[x], 64			    \n\t"
	"addi		%[ys], %[ys], 64		    \n\t"

	"lxvd2x		52, 0, %[yl]			    \n\t"
	"lxvd2x		53, %[o16], %[yl]		    \n\t"
	"lxvd2x		54, %[o32], %[yl]		    \n\t"
	"lxvd2x		55, %[o48], %[yl]		    \n\t"

	"addi		%[yl], %[yl], 64		    \n\t"

	"addic.		%[i] , %[i] , -16		    \n\t"
	"bgt		1b			    \n\t"

	"2:					    \n\t"

	/* drain: finish the group that is still held in registers */
	"xvmaddadp	48, 40, 32		    \n\t"
	"xvmaddadp	49, 41, 32		    \n\t"
	"xvmaddadp	50, 42, 32		    \n\t"
	"xvmaddadp	51, 43, 32		    \n\t"

	"xvmaddadp	52, 44, 32		    \n\t"
	"xvmaddadp	53, 45, 32		    \n\t"
	"xvmaddadp	54, 46, 32		    \n\t"
	"xvmaddadp	55, 47, 32		    \n\t"

	"stxvd2x	48,  0, %[ys]		    \n\t"
	"stxvd2x	49, %[o16], %[ys]		    \n\t"
	"stxvd2x	50, %[o32], %[ys]		    \n\t"
	"stxvd2x	51, %[o48], %[ys]		    \n\t"

	"addi		%[ys], %[ys], 64		    \n\t"

	"stxvd2x	52,  0, %[ys]		    \n\t"
	"stxvd2x	53, %[o16], %[ys]		    \n\t"
	"stxvd2x	54, %[o32], %[ys]		    \n\t"
	"stxvd2x	55, %[o48], %[ys]		    \n\t"

	"addi		%[ys], %[ys], 64		    \n\t"

	:
	  [i]     "+r" (i),		// remaining element count
	  [x]     "+b" (x1),		// read cursor into x
	  [yl]    "+b" (y_load),	// read cursor into y
	  [ys]    "+b" (y_store)	// write cursor into y
	:
	  [alpha] "r" (alpha),
	  [o16]   "b" (o16),
	  [o32]   "b" (o32),
	  [o48]   "b" (o48),
	  [pre]   "r" (pre)
	: "cr0", "memory",
	  "vs32", "vs33",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
131
kernel/power/dcopy.c
Normal file
131
kernel/power/dcopy.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dcopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback used when no optimized micro-kernel has defined
 * HAVE_KERNEL_32.
 *
 * Copies x[0 .. n-1] to y[0 .. n-1] in groups of eight.  Each group is
 * read completely before any of it is written.  A full group is always
 * processed, so n is expected to be a multiple of eight (the caller in
 * this file passes a multiple of thirty-two).
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT group[8];
	BLASLONG done;
	int k;

	for (done = 0; done < n; done += 8)
	{
		/* gather the whole group first ... */
		for (k = 0; k < 8; k++)
			group[k] = x[done + k];

		/* ... then write it out */
		for (k = 0; k < 8; k++)
			y[done + k] = group[k];
	}

	return;
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * y := x
 *
 *   n          number of elements to copy
 *   x, inc_x   source vector and its stride (in elements)
 *   y, inc_y   destination vector and its stride (in elements)
 *
 * Always returns 0.  When both strides are one, the largest multiple of
 * thirty-two elements is copied by dcopy_kernel_32() and the remainder by
 * a scalar loop; otherwise a scalar strided loop copies everything.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k = 0;
	BLASLONG src = 0;
	BLASLONG dst = 0;
	BLASLONG bulk;

	if ( n <= 0 ) return(0);

	if ( (inc_x != 1) || (inc_y != 1) )
	{
		/* general strides: element-by-element copy */
		for (k = 0; k < n; k++)
		{
			y[dst] = x[src] ;
			src += inc_x ;
			dst += inc_y ;
		}

		return(0);
	}

	/* contiguous vectors: bulk copy through the kernel, then the tail */
	bulk = n & -32;
	if ( bulk > 0 )
	{
		dcopy_kernel_32(bulk, x, y);
		k = bulk;
	}

	for (; k < n; k++)
		y[k] = x[k] ;

	return(0);
}
|
||||
|
||||
|
||||
174
kernel/power/dcopy_microk_power8.c
Normal file
174
kernel/power/dcopy_microk_power8.c
Normal file
@@ -0,0 +1,174 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX micro-kernel: copy n doubles from x to y.
 *
 *   n  number of elements; thirty-two elements are loaded before the first
 *      count check, so n must be a positive multiple of thirty-two
 *   x  source vector (contiguous doubles)
 *   y  destination vector (contiguous doubles)
 *
 * The loop is software-pipelined: each iteration stores the thirty-two
 * elements loaded by the previous one while loading the next thirty-two.
 *
 * Inline-asm contract:
 *   - the counter and both pointers are modified by the asm and are
 *     therefore read-write ("+") operands rather than inputs;
 *   - operands used in an RA slot (addi, the index of lxvd2x / stxvd2x)
 *     use the "b" constraint, because r0 in that slot is read as the
 *     constant 0;
 *   - all VSX registers written by the code are listed as clobbers
 *     (vs52..vs57 overlay the callee-saved v20..v25).
 */
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	FLOAT *y1=y;

	__asm__  __volatile__
	(

	/* preload x[0..15] -> vs40..vs47 */
	"lxvd2x		40, 0, %[x]			    \n\t"
	"lxvd2x		41, %[o16], %[x]		    \n\t"
	"lxvd2x		42, %[o32], %[x]		    \n\t"
	"lxvd2x		43, %[o48], %[x]		    \n\t"
	"lxvd2x		44, %[o64], %[x]		    \n\t"
	"lxvd2x		45, %[o80], %[x]		    \n\t"
	"lxvd2x		46, %[o96], %[x]		    \n\t"
	"lxvd2x		47, %[o112], %[x]		    \n\t"

	"addi		%[x], %[x], 128			    \n\t"

	/* preload x[16..31] -> vs50..vs57 */
	"lxvd2x		50, 0, %[x]			    \n\t"
	"lxvd2x		51, %[o16], %[x]		    \n\t"
	"lxvd2x		52, %[o32], %[x]		    \n\t"
	"lxvd2x		53, %[o48], %[x]		    \n\t"
	"lxvd2x		54, %[o64], %[x]		    \n\t"
	"lxvd2x		55, %[o80], %[x]		    \n\t"
	"lxvd2x		56, %[o96], %[x]		    \n\t"
	"lxvd2x		57, %[o112], %[x]		    \n\t"

	"addi		%[x], %[x], 128			    \n\t"

	"addic.		%[i] , %[i] , -32		    \n\t"
	"ble		2f			    \n\t"

	".align 5				    \n\t"
	"1:					    \n\t"

	/* store the previous group while loading the next one */
	"stxvd2x		40, 0, %[y]		    \n\t"
	"stxvd2x		41, %[o16], %[y]	    \n\t"
	"lxvd2x		40, 0, %[x]			    \n\t"
	"lxvd2x		41, %[o16], %[x]		    \n\t"
	"stxvd2x		42, %[o32], %[y]	    \n\t"
	"stxvd2x		43, %[o48], %[y]	    \n\t"
	"lxvd2x		42, %[o32], %[x]		    \n\t"
	"lxvd2x		43, %[o48], %[x]		    \n\t"
	"stxvd2x		44, %[o64], %[y]	    \n\t"
	"stxvd2x		45, %[o80], %[y]	    \n\t"
	"lxvd2x		44, %[o64], %[x]		    \n\t"
	"lxvd2x		45, %[o80], %[x]		    \n\t"
	"stxvd2x		46, %[o96], %[y]	    \n\t"
	"stxvd2x		47, %[o112], %[y]	    \n\t"
	"lxvd2x		46, %[o96], %[x]		    \n\t"
	"lxvd2x		47, %[o112], %[x]		    \n\t"

	"addi		%[y], %[y], 128			    \n\t"
	"addi		%[x], %[x], 128			    \n\t"

	"stxvd2x		50, 0, %[y]		    \n\t"
	"stxvd2x		51, %[o16], %[y]	    \n\t"
	"lxvd2x		50, 0, %[x]			    \n\t"
	"lxvd2x		51, %[o16], %[x]		    \n\t"
	"stxvd2x		52, %[o32], %[y]	    \n\t"
	"stxvd2x		53, %[o48], %[y]	    \n\t"
	"lxvd2x		52, %[o32], %[x]		    \n\t"
	"lxvd2x		53, %[o48], %[x]		    \n\t"
	"stxvd2x		54, %[o64], %[y]	    \n\t"
	"stxvd2x		55, %[o80], %[y]	    \n\t"
	"lxvd2x		54, %[o64], %[x]		    \n\t"
	"lxvd2x		55, %[o80], %[x]		    \n\t"
	"stxvd2x		56, %[o96], %[y]	    \n\t"
	"stxvd2x		57, %[o112], %[y]	    \n\t"
	"lxvd2x		56, %[o96], %[x]		    \n\t"
	"lxvd2x		57, %[o112], %[x]		    \n\t"

	"addi		%[y], %[y], 128			    \n\t"
	"addi		%[x], %[x], 128			    \n\t"

	"addic.		%[i] , %[i] , -32		    \n\t"
	"bgt		1b			    \n\t"

	"2:					    \n\t"

	/* drain: store the last group that is still held in registers */
	"stxvd2x		40, 0, %[y]		    \n\t"
	"stxvd2x		41, %[o16], %[y]	    \n\t"
	"stxvd2x		42, %[o32], %[y]	    \n\t"
	"stxvd2x		43, %[o48], %[y]	    \n\t"
	"stxvd2x		44, %[o64], %[y]	    \n\t"
	"stxvd2x		45, %[o80], %[y]	    \n\t"
	"stxvd2x		46, %[o96], %[y]	    \n\t"
	"stxvd2x		47, %[o112], %[y]	    \n\t"

	"addi		%[y], %[y], 128			    \n\t"

	"stxvd2x		50, 0, %[y]		    \n\t"
	"stxvd2x		51, %[o16], %[y]	    \n\t"
	"stxvd2x		52, %[o32], %[y]	    \n\t"
	"stxvd2x		53, %[o48], %[y]	    \n\t"
	"stxvd2x		54, %[o64], %[y]	    \n\t"
	"stxvd2x		55, %[o80], %[y]	    \n\t"
	"stxvd2x		56, %[o96], %[y]	    \n\t"
	"stxvd2x		57, %[o112], %[y]	    \n\t"

	:
	  [i]    "+r" (i),	// remaining element count, decremented in the asm
	  [y]    "+b" (y1),	// write pointer, advanced in the asm
	  [x]    "+b" (x1)	// read pointer, advanced in the asm
	:
	  [o16]  "b" (o16),
	  [o32]  "b" (o32),
	  [o48]  "b" (o48),
	  [o64]  "b" (o64),
	  [o80]  "b" (o80),
	  [o96]  "b" (o96),
	  [o112] "b" (o112)
	: "cr0", "memory",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs50", "vs51", "vs52", "vs53", "vs54", "vs55", "vs56", "vs57"
	);

}
|
||||
|
||||
|
||||
139
kernel/power/ddot.c
Normal file
139
kernel/power/ddot.c
Normal file
@@ -0,0 +1,139 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "ddot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
/*
 * Portable fallback for the dot-product micro kernel.
 *
 * Adds sum(y[k] * x[k]) for k in [0, n) to *d.  The loop consumes eight
 * elements per iteration and has no remainder handling, so n is expected
 * to be a multiple of 8 (the caller in this file passes a multiple of 16).
 */
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc = 0.0;
	BLASLONG k;

	for (k = 0; k < n; k += 8) {
		const FLOAT *xk = x + k;
		const FLOAT *yk = y + k;

		/* One expression, evaluated left to right, so the rounding
		 * sequence matches the straightforward unrolled form. */
		acc += yk[0] * xk[0]
		     + yk[1] * xk[1]
		     + yk[2] * xk[2]
		     + yk[3] * xk[3]
		     + yk[4] * xk[4]
		     + yk[5] * xk[5]
		     + yk[6] * xk[6]
		     + yk[7] * xk[7];
	}

	*d += acc;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Dot product of two strided vectors.
 *
 *   n      number of elements; n <= 0 yields 0.0
 *   x, y   input vectors
 *   inc_x  element stride of x
 *   inc_y  element stride of y
 *
 * Returns sum(y[k * inc_y] * x[k * inc_x]) for k in [0, n).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;

	if (n <= 0)
		return 0.0;

	if (inc_x == 1 && inc_y == 1) {
		/* Contiguous data: hand the largest multiple of 16 elements to
		 * the micro kernel and finish the remainder one by one. */
		FLOAT sum = 0.0;
		const BLASLONG bulk = n & -16;

		if (bulk)
			ddot_kernel_8(bulk, x, y, &sum);

		for (k = bulk; k < n; k++)
			sum += y[k] * x[k];

		return sum;
	}

	{
		/* Strided data: four products per iteration, folded into two
		 * independent partial sums that are combined at the end. */
		FLOAT sum_a = 0.0;
		FLOAT sum_b = 0.0;
		const BLASLONG quads = n & -4;
		const FLOAT *xp = x;
		const FLOAT *yp = y;

		for (k = 0; k < quads; k += 4) {
			FLOAT p0 = yp[0]         * xp[0];
			FLOAT p1 = yp[inc_y]     * xp[inc_x];
			FLOAT p2 = yp[2 * inc_y] * xp[2 * inc_x];
			FLOAT p3 = yp[3 * inc_y] * xp[3 * inc_x];

			xp += inc_x * 4;
			yp += inc_y * 4;

			sum_a += p0 + p2;
			sum_b += p1 + p3;
		}

		/* Up to three leftover elements go into the first partial sum. */
		for (; k < n; k++) {
			sum_a += yp[0] * xp[0];
			xp += inc_x;
			yp += inc_y;
		}

		return sum_a + sum_b;
	}
}
|
||||
|
||||
|
||||
178
kernel/power/ddot_microk_power8.c
Normal file
178
kernel/power/ddot_microk_power8.c
Normal file
@@ -0,0 +1,178 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * POWER8/VSX dot-product micro kernel.
 *
 * Stores sum(x[k] * y[k]) for k in [0, n) to *dot (the previous content of
 * *dot is overwritten, not accumulated).
 *
 * The template unconditionally preloads 16 doubles from each vector and then
 * consumes 16 doubles per loop iteration, so n must be a positive multiple
 * of 16.
 *
 * Register use inside the template:
 *   vs32..vs39  eight partial-sum accumulators
 *   vs40..vs47  current 16 doubles of x
 *   vs48..vs55  current 16 doubles of y
 *
 * Operand constraints:
 *   - The element counter (%0) and the two running pointers (%2, %3) are
 *     modified by the template (addic./addi), so they are declared as
 *     read-write operands; declaring them as plain inputs would let the
 *     compiler assume they still hold their original values afterwards.
 *   - %2, %3 and the byte offsets %5..%11 appear in the RA slot of
 *     addi/dcbt/lxvd2x, where r0 encodes the literal value 0; they use the
 *     "b" (base register) constraint so r0 is never chosen.
 *   - n (%1) is not referenced by the template.  It is kept, as a read-write
 *     operand, purely so that the operand numbers used in the template do
 *     not change.
 *   - All VSX registers written by the template are listed as clobbers.
 */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
	BLASLONG i = n;		/* remaining element count */
	BLASLONG o16 = 16;	/* byte offsets of the eight 16-byte vectors */
	BLASLONG o32 = 32;	/* inside one 128-byte chunk                 */
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	FLOAT *y1=y;
	BLASLONG pre = 384;	/* prefetch distance in bytes for dcbt */

	__asm__  __volatile__
	(
	/* clear the eight accumulators */
	"xxlxor		32,32,32		    \n\t"
	"xxlxor		33,33,33		    \n\t"
	"xxlxor		34,34,34		    \n\t"
	"xxlxor		35,35,35		    \n\t"
	"xxlxor		36,36,36		    \n\t"
	"xxlxor		37,37,37		    \n\t"
	"xxlxor		38,38,38		    \n\t"
	"xxlxor		39,39,39		    \n\t"

	"dcbt		%2, %12			    \n\t"
	"dcbt		%3, %12			    \n\t"

	/* preload the first 16 doubles of x and y */
	"lxvd2x		40, 0, %2		    \n\t"
	"lxvd2x		48, 0, %3		    \n\t"
	"lxvd2x		41, %5, %2		    \n\t"
	"lxvd2x		49, %5, %3		    \n\t"
	"lxvd2x		42, %6, %2		    \n\t"
	"lxvd2x		50, %6, %3		    \n\t"
	"lxvd2x		43, %7, %2		    \n\t"
	"lxvd2x		51, %7, %3		    \n\t"
	"lxvd2x		44, %8, %2		    \n\t"
	"lxvd2x		52, %8, %3		    \n\t"
	"lxvd2x		45, %9, %2		    \n\t"
	"lxvd2x		53, %9, %3		    \n\t"
	"lxvd2x		46, %10, %2		    \n\t"
	"lxvd2x		54, %10, %3		    \n\t"
	"lxvd2x		47, %11, %2		    \n\t"
	"lxvd2x		55, %11, %3		    \n\t"

	"addi		%2, %2, 128		    \n\t"
	"addi		%3, %3, 128		    \n\t"

	"addic.		%0 , %0	, -16  	 	    \n\t"
	"ble		2f		             \n\t"

	".align 5				    \n\t"
	"1:				     	    \n\t"

	"dcbt		%2, %12			    \n\t"
	"dcbt		%3, %12			    \n\t"

	/* multiply-accumulate the loaded chunk while loading the next one */
	"xvmaddadp	32, 40, 48		    \n\t"
	"lxvd2x		40, 0, %2		    \n\t"
	"lxvd2x		48, 0, %3		    \n\t"
	"xvmaddadp	33, 41, 49		    \n\t"
	"lxvd2x		41, %5, %2		    \n\t"
	"lxvd2x		49, %5, %3		    \n\t"
	"xvmaddadp	34, 42, 50		    \n\t"
	"lxvd2x		42, %6, %2		    \n\t"
	"lxvd2x		50, %6, %3		    \n\t"
	"xvmaddadp	35, 43, 51		    \n\t"
	"lxvd2x		43, %7, %2		    \n\t"
	"lxvd2x		51, %7, %3		    \n\t"
	"xvmaddadp	36, 44, 52		    \n\t"
	"lxvd2x		44, %8, %2		    \n\t"
	"lxvd2x		52, %8, %3		    \n\t"
	"xvmaddadp	37, 45, 53		    \n\t"
	"lxvd2x		45, %9, %2		    \n\t"
	"lxvd2x		53, %9, %3		    \n\t"
	"xvmaddadp	38, 46, 54		    \n\t"
	"lxvd2x		46, %10, %2		    \n\t"
	"lxvd2x		54, %10, %3		    \n\t"
	"xvmaddadp	39, 47, 55		    \n\t"

	"lxvd2x		47, %11, %2		    \n\t"
	"lxvd2x		55, %11, %3		    \n\t"

	"addi		%2, %2, 128		    \n\t"
	"addi		%3, %3, 128		    \n\t"

	"addic.		%0 , %0	, -16  	 	    \n\t"
	"bgt		1b		             \n\t"

	"2:					    \n\t"

	/* consume the last loaded chunk */
	"xvmaddadp	32, 40, 48		    \n\t"
	"xvmaddadp	33, 41, 49		    \n\t"
	"xvmaddadp	34, 42, 50		    \n\t"
	"xvmaddadp	35, 43, 51		    \n\t"
	"xvmaddadp	36, 44, 52		    \n\t"
	"xvmaddadp	37, 45, 53		    \n\t"
	"xvmaddadp	38, 46, 54		    \n\t"
	"xvmaddadp	39, 47, 55		    \n\t"

	/* pairwise reduction of the eight accumulators into vs32 */
	"xvadddp	32, 32, 33		     \n\t"
	"xvadddp	34, 34, 35		     \n\t"
	"xvadddp	36, 36, 37		     \n\t"
	"xvadddp	38, 38, 39		     \n\t"

	"xvadddp	32, 32, 34		     \n\t"
	"xvadddp	36, 36, 38		     \n\t"

	"xvadddp	32, 32, 36		     \n\t"

	/* add the two halves of vs32 and store the scalar result to *dot */
	"xxswapd	33, 32			     \n\t"

	"xsadddp	32, 32, 33		     \n\t"

	"stxsdx		32, 0, %4		     \n\t"

	:
	  "+r" (i),	// 0  modified: element counter
	  "+r" (n),	// 1  unused by the template, keeps operand numbering
	  "+b" (x1),	// 2  modified: running x pointer
	  "+b" (y1)	// 3  modified: running y pointer
	:
	  "r" (dot),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112),	// 11
	  "r" (pre)	// 12
	: "cr0",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "memory"
	);
}
|
||||
|
||||
|
||||
426
kernel/power/dgemv_n.c
Normal file
426
kernel/power/dgemv_n.c
Normal file
@@ -0,0 +1,426 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dgemv_n_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#define NBMAX 4096
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
/*
 * Portable fallback: y[0..n) += alpha * (four columns of A) * xo[0..4).
 *
 *   n      number of rows to update; rows are handled four at a time with
 *          no remainder handling, so n is expected to be a multiple of 4
 *   ap     ap[0..3] point to the four columns
 *   xo     the four x entries that belong to those columns
 *   y      destination, updated in place
 *   alpha  pointer to the scalar multiplier
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];
	FLOAT *col2 = ap[2];
	FLOAT *col3 = ap[3];

	/* Pre-scale the x entries once instead of once per row. */
	const FLOAT s0 = xo[0] * *alpha;
	const FLOAT s1 = xo[1] * *alpha;
	const FLOAT s2 = xo[2] * *alpha;
	const FLOAT s3 = xo[3] * *alpha;

	BLASLONG base, r;

	for (base = 0; base < n; base += 4) {
		for (r = base; r < base + 4; r++)
			y[r] += col0[r]*s0 + col1[r]*s1 + col2[r]*s2 + col3[r]*s3;
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x2
|
||||
|
||||
/*
 * y[0..n) += alpha * (two columns of A) * xo[0..2).
 *
 *   n      number of rows to update; rows are handled four at a time with
 *          no remainder handling, so n is expected to be a multiple of 4
 *   ap     ap[0] and ap[1] point to the two columns
 *   xo     the two x entries that belong to those columns
 *   y      destination, updated in place
 *   alpha  pointer to the scalar multiplier
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];

	/* Pre-scale the x entries once instead of once per row. */
	const FLOAT s0 = xo[0] * *alpha;
	const FLOAT s1 = xo[1] * *alpha;

	BLASLONG base, r;

	for (base = 0; base < n; base += 4) {
		for (r = base; r < base + 4; r++)
			y[r] += col0[r]*s0 + col1[r]*s1;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x1
|
||||
|
||||
/*
 * y[0..n) += alpha * xo[0] * (one column of A).
 *
 *   n      number of rows to update; rows are handled four at a time with
 *          no remainder handling, so n is expected to be a multiple of 4
 *   ap     pointer to the column (note: a plain pointer, not a pointer
 *          array as in the 4x4 and 4x2 kernels)
 *   xo     xo[0] is the x entry that belongs to the column
 *   y      destination, updated in place
 *   alpha  pointer to the scalar multiplier
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	/* Pre-scale the x entry once instead of once per row. */
	const FLOAT s0 = xo[0] * *alpha;

	BLASLONG base, r;

	for (base = 0; base < n; base += 4) {
		for (r = base; r < base + 4; r++)
			y[r] += ap[r]*s0;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Adds the contiguous vector src[0..n) onto a strided destination:
 * dest[k * inc_dest] += src[k].
 *
 * Nothing is done when inc_dest == 1.  The only caller in this file invokes
 * this helper for non-unit strides only; with unit stride it lets the
 * kernels accumulate directly into y instead.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG k;

	if (inc_dest == 1)
		return;

	for (k = 0; k < n; k++)
		dest[k * inc_dest] += src[k];
}
|
||||
|
||||
/*
 * y := y + alpha * A * x for a column-major m x n matrix A.
 *
 *   m, n     number of rows / columns of A; nothing is done if either is < 1
 *   dummy1   unused
 *   alpha    scalar multiplier
 *   a, lda   matrix and its leading dimension (distance between columns)
 *   x, inc_x input vector of n elements and its stride
 *   y, inc_y vector of m elements to update and its stride
 *   buffer   scratch space; used for up to NBMAX elements when inc_y != 1
 *
 * Always returns 0.
 *
 * Rows that form complete groups of four are processed in panels of at most
 * NBMAX rows with the dgemv_kernel_4x{4,2,1} kernels; the remaining
 * m & 3 rows are handled by scalar loops at the end.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4 = lda << 2;	/* distance between groups of four columns */
	FLOAT *ap[4] __attribute__ ((aligned (16)));
	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
	FLOAT alpha_r[4] __attribute__ ((aligned (16)));
	FLOAT *ybuffer;

	/* The kernels take alpha by pointer and read only the first element. */
	alpha_r[0] = alpha;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	ybuffer = buffer;

	n1 = n >> 2 ;		/* number of complete groups of four columns */
	n2 = n &  3 ;		/* leftover columns                          */

	m3 = m & 3  ;				/* leftover rows, handled at the end     */
	m1 = m & -4 ;				/* rows handled by the kernels           */
	m2 = (m & (NBMAX-1)) - m3 ;		/* size of the final, partial row panel  */

	y_ptr = y;

	BLASLONG NB = NBMAX;

	/* Full panels of NBMAX rows while they fit, then at most one panel of
	 * m2 rows; setting NB to m2 also terminates the loop. */
	while ( NB == NBMAX )
	{
		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		a_ptr = a;
		x_ptr = x;

		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		/* With a strided y the kernels accumulate into a zeroed scratch
		 * buffer that is added onto y afterwards; with unit stride they
		 * update y in place. */
		if ( inc_y != 1 )
			memset(ybuffer, 0, NB * sizeof(FLOAT));
		else
			ybuffer = y_ptr;

		if ( inc_x == 1 )
		{
			for( i = 0; i < n1 ; i++)
			{
				dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
				x_ptr += 4;
			}

			if ( n2 & 2 )
			{
				dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r);
				a_ptr += lda*2;
				x_ptr += 2;
			}

			if ( n2 & 1 )
			{
				dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r);
				a_ptr += lda;
				x_ptr += 1;
			}
		}
		else
		{
			/* Strided x: gather the entries into xbuffer first. */
			for( i = 0; i < n1 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[1] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[3] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
			}

			for( i = 0; i < n2 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r);
				a_ptr += lda;
			}
		}

		/* Advance to the next row panel. */
		a += NB;
		if ( inc_y != 1 )
		{
			add_y(NB,ybuffer,y_ptr,inc_y);
			y_ptr += NB * inc_y;
		}
		else
			y_ptr += NB ;
	}

	if ( m3 == 0 ) return(0);

	/* Leftover rows: a and y_ptr now address the first of the m3 remaining
	 * rows.  Each case accumulates one dot product per row over all n
	 * columns and adds alpha times that sum to y. */

	if ( m3 == 3 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		FLOAT temp2 = 0.0;
		if ( lda == 3 && inc_x ==1 )
		{
			/* Densely packed 3-row matrix: four columns per iteration. */
			for( i = 0; i < ( n & -4 ); i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

				a_ptr += 12;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += 3;
				x_ptr ++;
			}
		}
		else
		{
			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}
		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp2;
		return(0);
	}

	if ( m3 == 2 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		if ( lda == 2 && inc_x ==1 )
		{
			/* Densely packed 2-row matrix: four columns per iteration. */
			for( i = 0; i < (n & -4) ; i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
				a_ptr += 8;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += 2;
				x_ptr ++;
			}
		}
		else
		{
			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}
		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		return(0);
	}

	if ( m3 == 1 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp = 0.0;
		if ( lda == 1 && inc_x ==1 )
		{
			/* Single contiguous row: a plain dot product. */
			for( i = 0; i < (n & -4); i+=4 )
			{
				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
			}

			for( ; i < n; i++ )
			{
				temp += a_ptr[i] * x_ptr[i];
			}
		}
		else
		{
			for( i = 0; i < n; i++ )
			{
				temp += a_ptr[0] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}
		}
		y_ptr[0] += alpha * temp;
		return(0);
	}

	return(0);
}
|
||||
|
||||
|
||||
301
kernel/power/dgemv_n_microk_power8.c
Normal file
301
kernel/power/dgemv_n_microk_power8.c
Normal file
@@ -0,0 +1,301 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

/*
 * POWER8/VSX kernel: y[0..n) += alpha * (four columns of A) * xo[0..4).
 *
 *   n      number of rows; the template preloads four rows unconditionally
 *          and handles four rows per step, so n must be a positive multiple
 *          of 4
 *   ap     ap[0..3] point to the four columns
 *   xo     the four x entries that belong to those columns
 *   y      destination, updated in place
 *   alpha  pointer to the scalar multiplier
 *
 * Register use inside the template:
 *   vs32..vs35  alpha * xo[0..3], each splatted across the vector
 *   vs40, vs41  four elements of y
 *   vs48..vs55  four elements of each of the four columns
 *
 * Operand constraints:
 *   - %0 (row counter), %2 (y pointer) and %6..%9 (column pointers) are
 *     modified by the template (addic./addi), so they are read-write
 *     operands; as plain inputs the compiler could assume they still hold
 *     their original values afterwards.
 *   - Operands used in the RA slot of addi/dcbt/lxvd2x/lxvdsx, where r0
 *     encodes the literal value 0, use the "b" (base register) constraint.
 *   - %1, %3, %4 and %5 are NOT modified by the template.  They are listed
 *     as read-write operands only because output operands must precede input
 *     operands, and this keeps every operand number used in the template
 *     unchanged.
 *   - All VSX registers written by the template are listed as clobbers.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i=n;
	BLASLONG o8  = 8;	/* byte offsets used to address x[1..3] and the */
	BLASLONG o16 = 16;	/* second 16-byte vector of a four-element group */
	BLASLONG o24 = 24;
	BLASLONG pre = 384;	/* prefetch distance in bytes for dcbt */

	FLOAT *a0,*a1,*a2,*a3;
	/* y and the column pointers are advanced by one element here and moved
	 * back by 8 bytes at the top of the template; the net effect is that
	 * the template starts at y[0] / ap[k][0].
	 * NOTE(review): the reason for this round trip is not visible here. */
	FLOAT *y1=y+1;
	FLOAT x[4]  __attribute__ ((aligned (16)));
	FLOAT *xp = x;		/* pointer form of x, usable as an asm operand */
	a0 = ap[0]+1;
	a1 = ap[1]+1;
	a2 = ap[2]+1;
	a3 = ap[3]+1;

	/* Pre-scale the x entries; the template reads them back from memory. */
	x[0]=xo[0] * *alpha;
	x[1]=xo[1] * *alpha;
	x[2]=xo[2] * *alpha;
	x[3]=xo[3] * *alpha;

	__asm__  __volatile__
	(
	"lxvdsx		32, 0 , %1			\n\t"	// x0
	"lxvdsx		33,%3 , %1			\n\t"	// x1
	"lxvdsx		34,%4 , %1			\n\t"	// x2
	"lxvdsx		35,%5 , %1			\n\t"	// x3
	"addi		%2 , %2 , -8			\n\t"
	"addi		%6 , %6 , -8			\n\t"
	"addi		%7 , %7 , -8			\n\t"
	"addi		%8 , %8 , -8			\n\t"
	"addi		%9 , %9 , -8			\n\t"

	"lxvd2x		48, 0, %6			\n\t"	// a0[0], a0[1]
	"lxvd2x		49,%4, %6			\n\t"	// a0[2], a0[3]

	"lxvd2x		50, 0, %7			\n\t"	// a1[0], a1[1]
	"lxvd2x		51,%4, %7			\n\t"	// a1[2], a1[3]

	"lxvd2x		52, 0, %8			\n\t"	// a2[0], a2[1]
	"lxvd2x		53,%4, %8			\n\t"	// a2[2], a2[3]

	"lxvd2x		54, 0, %9			\n\t"	// a3[0], a3[1]
	"lxvd2x		55,%4, %9			\n\t"	// a3[2], a3[3]

	"addi		%6, %6, 32			\n\t"
	"addi		%7, %7, 32			\n\t"
	"addi		%8, %8, 32			\n\t"
	"addi		%9, %9, 32			\n\t"

	"addic.		%0 , %0	, -4  	 	        \n\t"
	"ble		2f		             	\n\t"

	".align 5				        \n\t"
	"1:				     	        \n\t"

	/* step 1 of 4 (the only one that issues prefetches) */
	"dcbt		%2, %10				\n\t"

	"lxvd2x		40, 0, %2			\n\t"	// y0, y1
	"lxvd2x		41,%4, %2			\n\t"	// y2, y3

	"dcbt		%6, %10				\n\t"
	"dcbt		%7, %10				\n\t"
	"dcbt		%8, %10				\n\t"
	"dcbt		%9, %10				\n\t"

	"xvmaddadp	40, 48, 32			\n\t"
	"xvmaddadp	41, 49, 32			\n\t"

	"lxvd2x		48, 0, %6			\n\t"	// a0[0], a0[1]
	"lxvd2x		49,%4, %6			\n\t"	// a0[2], a0[3]

	"xvmaddadp	40, 50, 33			\n\t"
	"addi		%6, %6, 32			\n\t"
	"xvmaddadp	41, 51, 33			\n\t"

	"lxvd2x		50, 0, %7			\n\t"	// a1[0], a1[1]
	"lxvd2x		51,%4, %7			\n\t"	// a1[2], a1[3]

	"xvmaddadp	40, 52, 34			\n\t"
	"addi		%7, %7, 32			\n\t"
	"xvmaddadp	41, 53, 34			\n\t"

	"lxvd2x		52, 0, %8			\n\t"	// a2[0], a2[1]
	"lxvd2x		53,%4, %8			\n\t"	// a2[2], a2[3]

	"xvmaddadp	40, 54, 35			\n\t"
	"addi		%8, %8, 32			\n\t"
	"xvmaddadp	41, 55, 35			\n\t"

	"stxvd2x	40, 0, %2			\n\t"	// y0, y1
	"stxvd2x	41,%4, %2			\n\t"	// y2, y3

	"lxvd2x		54, 0, %9			\n\t"	// a3[0], a3[1]
	"lxvd2x		55,%4, %9			\n\t"	// a3[2], a3[3]

	"addi		%9, %9, 32			\n\t"
	"addi		%2, %2, 32			\n\t"

	"addic.		%0 , %0	, -4  	 	        \n\t"
	"ble		2f		             	\n\t"

	/* step 2 of 4 */
	"lxvd2x		40, 0, %2			\n\t"	// y0, y1
	"lxvd2x		41,%4, %2			\n\t"	// y2, y3

	"xvmaddadp	40, 48, 32			\n\t"
	"xvmaddadp	41, 49, 32			\n\t"

	"lxvd2x		48, 0, %6			\n\t"	// a0[0], a0[1]
	"lxvd2x		49,%4, %6			\n\t"	// a0[2], a0[3]

	"xvmaddadp	40, 50, 33			\n\t"
	"addi		%6, %6, 32			\n\t"
	"xvmaddadp	41, 51, 33			\n\t"

	"lxvd2x		50, 0, %7			\n\t"	// a1[0], a1[1]
	"lxvd2x		51,%4, %7			\n\t"	// a1[2], a1[3]

	"xvmaddadp	40, 52, 34			\n\t"
	"addi		%7, %7, 32			\n\t"
	"xvmaddadp	41, 53, 34			\n\t"

	"lxvd2x		52, 0, %8			\n\t"	// a2[0], a2[1]
	"lxvd2x		53,%4, %8			\n\t"	// a2[2], a2[3]

	"xvmaddadp	40, 54, 35			\n\t"
	"addi		%8, %8, 32			\n\t"
	"xvmaddadp	41, 55, 35			\n\t"

	"stxvd2x	40, 0, %2			\n\t"	// y0, y1
	"stxvd2x	41,%4, %2			\n\t"	// y2, y3

	"lxvd2x		54, 0, %9			\n\t"	// a3[0], a3[1]
	"lxvd2x		55,%4, %9			\n\t"	// a3[2], a3[3]

	"addi		%9, %9, 32			\n\t"
	"addi		%2, %2, 32			\n\t"

	"addic.		%0 , %0	, -4  	 	        \n\t"
	"ble		2f		             	\n\t"

	/* step 3 of 4 */
	"lxvd2x		40, 0, %2			\n\t"	// y0, y1
	"lxvd2x		41,%4, %2			\n\t"	// y2, y3

	"xvmaddadp	40, 48, 32			\n\t"
	"xvmaddadp	41, 49, 32			\n\t"

	"lxvd2x		48, 0, %6			\n\t"	// a0[0], a0[1]
	"lxvd2x		49,%4, %6			\n\t"	// a0[2], a0[3]

	"xvmaddadp	40, 50, 33			\n\t"
	"addi		%6, %6, 32			\n\t"
	"xvmaddadp	41, 51, 33			\n\t"

	"lxvd2x		50, 0, %7			\n\t"	// a1[0], a1[1]
	"lxvd2x		51,%4, %7			\n\t"	// a1[2], a1[3]

	"xvmaddadp	40, 52, 34			\n\t"
	"addi		%7, %7, 32			\n\t"
	"xvmaddadp	41, 53, 34			\n\t"

	"lxvd2x		52, 0, %8			\n\t"	// a2[0], a2[1]
	"lxvd2x		53,%4, %8			\n\t"	// a2[2], a2[3]

	"xvmaddadp	40, 54, 35			\n\t"
	"addi		%8, %8, 32			\n\t"
	"xvmaddadp	41, 55, 35			\n\t"

	"stxvd2x	40, 0, %2			\n\t"	// y0, y1
	"stxvd2x	41,%4, %2			\n\t"	// y2, y3

	"lxvd2x		54, 0, %9			\n\t"	// a3[0], a3[1]
	"lxvd2x		55,%4, %9			\n\t"	// a3[2], a3[3]

	"addi		%9, %9, 32			\n\t"
	"addi		%2, %2, 32			\n\t"

	"addic.		%0 , %0	, -4  	 	        \n\t"
	"ble		2f		             	\n\t"

	/* step 4 of 4 */
	"lxvd2x		40, 0, %2			\n\t"	// y0, y1
	"lxvd2x		41,%4, %2			\n\t"	// y2, y3

	"xvmaddadp	40, 48, 32			\n\t"
	"xvmaddadp	41, 49, 32			\n\t"

	"lxvd2x		48, 0, %6			\n\t"	// a0[0], a0[1]
	"lxvd2x		49,%4, %6			\n\t"	// a0[2], a0[3]

	"xvmaddadp	40, 50, 33			\n\t"
	"addi		%6, %6, 32			\n\t"
	"xvmaddadp	41, 51, 33			\n\t"

	"lxvd2x		50, 0, %7			\n\t"	// a1[0], a1[1]
	"lxvd2x		51,%4, %7			\n\t"	// a1[2], a1[3]

	"xvmaddadp	40, 52, 34			\n\t"
	"addi		%7, %7, 32			\n\t"
	"xvmaddadp	41, 53, 34			\n\t"

	"lxvd2x		52, 0, %8			\n\t"	// a2[0], a2[1]
	"lxvd2x		53,%4, %8			\n\t"	// a2[2], a2[3]

	"xvmaddadp	40, 54, 35			\n\t"
	"addi		%8, %8, 32			\n\t"
	"xvmaddadp	41, 55, 35			\n\t"

	"stxvd2x	40, 0, %2			\n\t"	// y0, y1
	"stxvd2x	41,%4, %2			\n\t"	// y2, y3

	"lxvd2x		54, 0, %9			\n\t"	// a3[0], a3[1]
	"lxvd2x		55,%4, %9			\n\t"	// a3[2], a3[3]

	"addi		%9, %9, 32			\n\t"
	"addi		%2, %2, 32			\n\t"

	"addic.		%0 , %0	, -4  	 	        \n\t"
	"bgt		1b		             	\n\t"

	"2:						\n\t"

	/* final four rows: the column data is already loaded */
	"lxvd2x		40, 0, %2			\n\t"	// y0, y1
	"lxvd2x		41,%4, %2			\n\t"	// y2, y3

	"xvmaddadp	40, 48, 32			\n\t"
	"xvmaddadp	41, 49, 32			\n\t"

	"xvmaddadp	40, 50, 33			\n\t"
	"xvmaddadp	41, 51, 33			\n\t"

	"xvmaddadp	40, 52, 34			\n\t"
	"xvmaddadp	41, 53, 34			\n\t"

	"xvmaddadp	40, 54, 35			\n\t"
	"xvmaddadp	41, 55, 35			\n\t"

	"stxvd2x	40, 0, %2			\n\t"	// y0, y1
	"stxvd2x	41,%4, %2			\n\t"	// y2, y3

	:
	  "+r" (i),	// 0  modified: row counter
	  "+r" (xp),	// 1  not modified, see header comment
	  "+b" (y1),	// 2  modified: running y pointer
	  "+b" (o8),	// 3  not modified, see header comment
	  "+b" (o16),	// 4  not modified, see header comment
	  "+b" (o24),	// 5  not modified, see header comment
	  "+b" (a0),	// 6  modified: running column pointers
	  "+b" (a1),	// 7
	  "+b" (a2),	// 8
	  "+b" (a3)	// 9
	:
	  "r" (pre)	// 10
	: "cr0",
	  "vs32", "vs33", "vs34", "vs35",
	  "vs40", "vs41",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "memory"
	);
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user