Compare commits
150 Commits
arm_soft_f
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5f998efd7b | ||
|
|
88a35ff457 | ||
|
|
5dde4e65d3 | ||
|
|
27a9df6477 | ||
|
|
7224022473 | ||
|
|
468ac3df9e | ||
|
|
376048156b | ||
|
|
d1c5b8f913 | ||
|
|
91bde7d315 | ||
|
|
80373ea039 | ||
|
|
d12b75a6c4 | ||
|
|
7294fb1d9d | ||
|
|
31e086d6a6 | ||
|
|
cbb47736af | ||
|
|
2a7c6930ac | ||
|
|
c4ec882020 | ||
|
|
d33fc32cf3 | ||
|
|
529bfc36ec | ||
|
|
88249ca5f7 | ||
|
|
731c518cff | ||
|
|
29fc429d9a | ||
|
|
e2d3b1561a | ||
|
|
4a012c3d20 | ||
|
|
d5ef0dee9a | ||
|
|
a590e6135c | ||
|
|
4239dd65ce | ||
|
|
3db2adf872 | ||
|
|
ad2462811a | ||
|
|
c1cf62d2c0 | ||
|
|
bfe1656b8b | ||
|
|
f02d535fde | ||
|
|
49e62c0e77 | ||
|
|
3381f23709 | ||
|
|
fa6a920caa | ||
|
|
a6515bb858 | ||
|
|
c66b842d66 | ||
|
|
df2dfe65d6 | ||
|
|
2c8d634619 | ||
|
|
37efb5bc1d | ||
|
|
97d671eb61 | ||
|
|
305cd2e8b4 | ||
|
|
09bc6ebe5b | ||
|
|
872a11a2bf | ||
|
|
eda9e8632a | ||
|
|
8f83d3f961 | ||
|
|
e5e47cfdb5 | ||
|
|
ebf9e9dabe | ||
|
|
83bd547517 | ||
|
|
e25f4c01d6 | ||
|
|
54915ce343 | ||
|
|
0150fabdb6 | ||
|
|
4f0773f07d | ||
|
|
aa5edebc80 | ||
|
|
89924b3d5b | ||
|
|
da7f0ff425 | ||
|
|
0d5c8e5386 | ||
|
|
912410f214 | ||
|
|
b122413fb0 | ||
|
|
9b7b5f7fdc | ||
|
|
34513be726 | ||
|
|
482015f8d6 | ||
|
|
639000e34f | ||
|
|
5de7727cc7 | ||
|
|
96df4b9b17 | ||
|
|
29dc8e0c61 | ||
|
|
65e56cb29d | ||
|
|
bd831a03a8 | ||
|
|
edc97918f8 | ||
|
|
e0034de22d | ||
|
|
32c7fe6bff | ||
|
|
19bdf9d52b | ||
|
|
4f09030fdc | ||
|
|
6f4eca5ea4 | ||
|
|
be55f96cbd | ||
|
|
96dd0ef4f7 | ||
|
|
8f0d6c06a9 | ||
|
|
410a07cbec | ||
|
|
72f95a0acc | ||
|
|
e545b81e76 | ||
|
|
d7afdf9137 | ||
|
|
4f4daaa42a | ||
|
|
42bbe74791 | ||
|
|
c8322c65e4 | ||
|
|
87dde1fde6 | ||
|
|
42466e54fa | ||
|
|
3b0624d50f | ||
|
|
fd4e68128e | ||
|
|
6464d1723a | ||
|
|
59c97cfee4 | ||
|
|
de7875ca5d | ||
|
|
67836c2ab4 | ||
|
|
5fecfe0f42 | ||
|
|
bba6676803 | ||
|
|
5649b2c53a | ||
|
|
6e972994b2 | ||
|
|
5b04cf7ab4 | ||
|
|
d5ea8fd823 | ||
|
|
4beffaaa4b | ||
|
|
fb28e4adc9 | ||
|
|
26faa3ca47 | ||
|
|
4f75989634 | ||
|
|
1e06b49854 | ||
|
|
7f546f54fa | ||
|
|
a809431e34 | ||
|
|
5ee1cf0223 | ||
|
|
9aea7a0d9a | ||
|
|
da0987507c | ||
|
|
81fed55782 | ||
|
|
35387edb8d | ||
|
|
9c884986ad | ||
|
|
f2f0e98bb5 | ||
|
|
166d64eb7c | ||
|
|
e078339e8d | ||
|
|
832a272784 | ||
|
|
356606314c | ||
|
|
ed79a29d87 | ||
|
|
77d16ffc69 | ||
|
|
56762d5e4c | ||
|
|
90dd190a6d | ||
|
|
ab9ec4ab4e | ||
|
|
0cbd2d34e4 | ||
|
|
62979fd104 | ||
|
|
20a413e154 | ||
|
|
dc40bc7368 | ||
|
|
1acfc78c8f | ||
|
|
b4071d0d16 | ||
|
|
7908efafc8 | ||
|
|
66dc10b019 | ||
|
|
c9ff735da6 | ||
|
|
99880f7906 | ||
|
|
cd135e2b59 | ||
|
|
ad124a5e8b | ||
|
|
211d2eceb5 | ||
|
|
5813ed095b | ||
|
|
e44b028fe5 | ||
|
|
a6efabf155 | ||
|
|
ea26b00c06 | ||
|
|
08786c4b95 | ||
|
|
12e476f7a2 | ||
|
|
8de40955ad | ||
|
|
9b24688eed | ||
|
|
43224f7273 | ||
|
|
9254a701f3 | ||
|
|
26a614fdd1 | ||
|
|
7ae64f4f9c | ||
|
|
82e80fa82b | ||
|
|
bcfc298c38 | ||
|
|
ce7c6c6b2d | ||
|
|
85636ff1a0 | ||
|
|
12ab1804b6 |
@@ -2,16 +2,19 @@
|
||||
## Author: Hank Anderson <hank@statease.com>
|
||||
##
|
||||
|
||||
cmake_minimum_required(VERSION 2.8.4)
|
||||
cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 20.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 20)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
enable_language(C)
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
include(GNUInstallDirs)
|
||||
|
||||
if(MSVC)
|
||||
set(OpenBLAS_LIBNAME libopenblas)
|
||||
else()
|
||||
@@ -117,9 +120,12 @@ if (${NO_STATIC} AND ${NO_SHARED})
|
||||
endif ()
|
||||
|
||||
#Set default output directory
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
if(MSVC)
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
|
||||
endif ()
|
||||
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
@@ -139,9 +145,12 @@ if (NOT NO_LAPACKE)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
#Only generate .def for dll on MSVC
|
||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
||||
if(MSVC)
|
||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
|
||||
endif()
|
||||
|
||||
# add objects to the openblas lib
|
||||
@@ -156,15 +165,15 @@ set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(utest)
|
||||
|
||||
if(NOT MSVC)
|
||||
if (NOT MSVC)
|
||||
#only build shared library for MSVC
|
||||
|
||||
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
|
||||
@@ -216,23 +225,69 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
|
||||
# Install libraries
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||
RUNTIME DESTINATION bin
|
||||
ARCHIVE DESTINATION lib
|
||||
LIBRARY DESTINATION lib )
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
|
||||
# Install include files
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION include)
|
||||
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
|
||||
ADD_CUSTOM_COMMAND(
|
||||
OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
|
||||
)
|
||||
|
||||
ADD_CUSTOM_TARGET(genconfig
|
||||
ALL
|
||||
DEPENDS openblas_config.h
|
||||
)
|
||||
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
ADD_CUSTOM_TARGET(genf77blas
|
||||
ALL
|
||||
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||
)
|
||||
add_dependencies(genf77blas ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
ADD_CUSTOM_TARGET(gencblas
|
||||
ALL
|
||||
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
|
||||
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
|
||||
)
|
||||
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT MSVC)
|
||||
install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION lib)
|
||||
install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
endif()
|
||||
|
||||
include(FindPkgConfig QUIET)
|
||||
if(PKG_CONFIG_FOUND)
|
||||
set(prefix ${CMAKE_INSTALL_PREFIX})
|
||||
set(libdir ${CMAKE_INSTALL_PREFIX}/lib)
|
||||
set(includedir ${CMAKE_INSTALL_PREFIX}/include)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION lib/pkgconfig/)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
endif()
|
||||
|
||||
@@ -165,5 +165,6 @@ In chronological order:
|
||||
* Abdelrauf <https://github.com/quickwritereader>
|
||||
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
|
||||
* [2017-02-26] ztrmm kernel for IBM z13
|
||||
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,45 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.20
|
||||
24-Jul-2017
|
||||
|
||||
common:
|
||||
* Improved CMake support
|
||||
* Fixed several thread race and locking bugs
|
||||
* Fixed default LAPACK optimization level
|
||||
* Updated LAPACK to 3.7.0
|
||||
* Added ReLAPACK (https://github.com/HPAC/ReLAPACK, make BUILD_RELAPACK=1)
|
||||
|
||||
POWER:
|
||||
* Optimizations for Power9
|
||||
* Fixed several Power8 assembly bugs
|
||||
|
||||
ARM:
|
||||
* New optimized Vulcan and ThunderX2T99 targets
|
||||
* Support for ARMV7 SOFT_FP ABI (make ARM_SOFTFP_ABI=1)
|
||||
* Detect all cpu cores including offline ones
|
||||
* Fix compilation with CLANG
|
||||
* Support building a shared library for Android
|
||||
|
||||
MIPS:
|
||||
* Fixed several threading issues
|
||||
* Fix compilation with CLANG
|
||||
|
||||
x86_64:
|
||||
* Detect Intel Bay Trail and Apollo Lake
|
||||
* Detect Intel Sky Lake and Kaby Lake
|
||||
* Detect Intel Knights Landing
|
||||
* Detect AMD A8, A10, A12 and Ryzen
|
||||
* Support 64bit builds with Visual Studio
|
||||
* Fix building with Intel and PGI compilers
|
||||
* Fix building with MINGW and TDM-GCC
|
||||
* Fix cmake builds for Haswell and related cpus
|
||||
* Fix building for Sandybridge with CLANG 3.9
|
||||
* Add support for the FLANG compiler
|
||||
|
||||
IBM Z:
|
||||
* New target z13 with BLAS3 optimizations
|
||||
|
||||
====================================================================
|
||||
Version 0.2.19
|
||||
1-Sep-2016
|
||||
|
||||
22
Makefile
22
Makefile
@@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
|
||||
SUBDIRS += lapack
|
||||
endif
|
||||
|
||||
RELA =
|
||||
ifeq ($(BUILD_RELAPACK), 1)
|
||||
RELA = re_lapack
|
||||
endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
|
||||
.PHONY : all libs netlib test ctest shared install
|
||||
.NOTPARALLEL : all libs prof lapack-test install blas-test
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
|
||||
all :: libs netlib tests shared
|
||||
all :: libs netlib $(RELA) tests shared
|
||||
@echo
|
||||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
||||
@echo
|
||||
@@ -81,7 +86,7 @@ endif
|
||||
|
||||
shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@@ -215,6 +220,14 @@ ifndef NO_LAPACKE
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(NO_LAPACK), 1)
|
||||
re_lapack :
|
||||
|
||||
else
|
||||
re_lapack :
|
||||
@$(MAKE) -C relapack
|
||||
endif
|
||||
|
||||
prof_lapack : lapack_prebuild
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
||||
|
||||
@@ -326,6 +339,7 @@ endif
|
||||
@touch $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
|
||||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
|
||||
@$(MAKE) -C relapack clean
|
||||
@rm -f *.grd Makefile.conf_last config_last.h
|
||||
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
|
||||
@echo Done.
|
||||
|
||||
23
Makefile.arm
23
Makefile.arm
@@ -1,5 +1,4 @@
|
||||
#ifeq logical or
|
||||
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
@@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV7)
|
||||
ifeq ($(OSNAME), Android)
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV6)
|
||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -marm -march=armv5
|
||||
FCOMMON_OPT += -marm -march=armv5
|
||||
CCOMMON_OPT += -march=armv5
|
||||
FCOMMON_OPT += -march=armv5
|
||||
endif
|
||||
|
||||
@@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
endif
|
||||
|
||||
@@ -66,7 +66,7 @@ endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
|
||||
@@ -43,7 +43,7 @@ endif
|
||||
|
||||
ifeq ($(USE_MASS), 1)
|
||||
# Path to MASS libs, change it if the libs are installed at any other location
|
||||
MASSPATH = /opt/ibm/xlmass/8.1.3/lib
|
||||
MASSPATH = /opt/ibm/xlmass/8.1.5/lib
|
||||
COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
|
||||
EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
|
||||
endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.20.dev
|
||||
VERSION = 0.2.20
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -83,6 +83,9 @@ VERSION = 0.2.20.dev
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
@@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
NO_AFFINITY = 1
|
||||
#NO_AFFINITY = 1
|
||||
|
||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
|
||||
@@ -68,6 +68,9 @@ endif
|
||||
ifeq ($(TARGET), EXCAVATOR)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -98,6 +101,9 @@ endif
|
||||
ifeq ($(TARGET_CORE), EXCAVATOR)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), ZEN)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
@@ -236,6 +242,10 @@ EXTRALIB += -lm
|
||||
NO_EXPRECISION = 1
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
@@ -408,7 +418,6 @@ CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
$(error OpenBLAS: Clang didn't support OpenMP yet.)
|
||||
CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
@@ -443,12 +452,13 @@ ifneq ($(NO_AVX), 1)
|
||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
||||
endif
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += HASWELL
|
||||
DYNAMIC_CORE += HASWELL ZEN
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
DYNAMIC_ARCH =
|
||||
override DYNAMIC_ARCH=
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -480,12 +490,18 @@ BINARY_DEFINED = 1
|
||||
CCOMMON_OPT += -marm
|
||||
FCOMMON_OPT += -marm
|
||||
|
||||
# If softfp abi is mentioned on the command line, force it.
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
|
||||
FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
|
||||
CCOMMON_OPT += -mfloat-abi=softfp
|
||||
FCOMMON_OPT += -mfloat-abi=softfp
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
EXTRALIB += -lm
|
||||
else
|
||||
CCOMMON_OPT += -mfloat-abi=hard
|
||||
FCOMMON_OPT += -mfloat-abi=hard
|
||||
EXTRALIB += -Wl,-lm_hard
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -588,6 +604,23 @@ endif
|
||||
# Fortran Compiler dependent settings
|
||||
#
|
||||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
ifdef BINARY64
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
FCOMMON_OPT += -Wall
|
||||
else
|
||||
FCOMMON_OPT += -Wall
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), G77)
|
||||
CCOMMON_OPT += -DF_INTERFACE_G77
|
||||
FCOMMON_OPT += -Wall
|
||||
@@ -1096,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
LIB_COMPONENTS += LAPACKE
|
||||
endif
|
||||
ifeq ($(BUILD_RELAPACK), 1)
|
||||
LIB_COMPONENTS += ReLAPACK
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ONLY_CBLAS), 1)
|
||||
|
||||
20
README.md
20
README.md
@@ -51,18 +51,18 @@ The library can be installed as below -
|
||||
|
||||
* On Ubuntu:
|
||||
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install libxlmass-devel.8.1.3
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
|
||||
sudo apt-get update</br>
|
||||
sudo apt-get install libxlmass-devel.8.1.5</br>
|
||||
|
||||
* On RHEL/CentOS:
|
||||
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||
sudo rpm --import repomd.xml.key
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||
sudo yum install libxlmass-devel.8.1.3
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
|
||||
sudo rpm --import repomd.xml.key</br>
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
|
||||
sudo yum install libxlmass-devel.8.1.5</br>
|
||||
|
||||
After installing MASS library, compile openblas with USE_MASS=1.
|
||||
|
||||
@@ -107,7 +107,7 @@ Please read GotoBLAS_01Readme.txt
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **Z13**: blas3 for double
|
||||
- **Z13**: Optimized Level-3 BLAS
|
||||
|
||||
|
||||
### Support OS:
|
||||
|
||||
@@ -34,6 +34,7 @@ BULLDOZER
|
||||
PILEDRIVER
|
||||
STEAMROLLER
|
||||
EXCAVATOR
|
||||
ZEN
|
||||
|
||||
c)VIA CPU:
|
||||
SSE_GENERIC
|
||||
|
||||
@@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
|
||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER")
|
||||
endif ()
|
||||
if (NOT NO_AVX2)
|
||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL")
|
||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL ZEN")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -73,6 +73,10 @@ if (${ARCH} STREQUAL "X86")
|
||||
set(ARCH x86)
|
||||
endif ()
|
||||
|
||||
if (${ARCH} MATCHES "ppc")
|
||||
set(ARCH power)
|
||||
endif ()
|
||||
|
||||
set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
|
||||
if (${COMPILER_ID} STREQUAL "GNU")
|
||||
set(COMPILER_ID "GCC")
|
||||
@@ -87,3 +91,8 @@ file(WRITE ${TARGET_CONF}
|
||||
"#define __${BINARY}BIT__\t1\n"
|
||||
"#define FUNDERSCORE\t${FU}\n")
|
||||
|
||||
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
|
||||
file(APPEND ${TARGET_CONF}
|
||||
"#define OS_WINNT\t1\n")
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -3,6 +3,21 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64)
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
endif ()
|
||||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "G77")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
||||
|
||||
@@ -318,7 +318,7 @@ set(ZLASRC
|
||||
zlarfg.f zlarft.f zlarfgp.f
|
||||
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f zlasy_aa.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f zlasyf_aa.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
|
||||
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
|
||||
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
prefix=@prefix@
|
||||
libdir=@libdir@
|
||||
includedir=@includedir@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
|
||||
@@ -77,7 +77,7 @@ if (CYGWIN)
|
||||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
|
||||
if (SMP)
|
||||
set(EXTRALIB "${EXTRALIB} -lpthread")
|
||||
endif ()
|
||||
|
||||
@@ -4,7 +4,8 @@
|
||||
## This is triggered by system.cmake and runs before any of the code is built.
|
||||
## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files).
|
||||
## Next it runs f_check and appends some fortran information to the files.
|
||||
## Finally it runs getarch and getarch_2nd for even more environment information.
|
||||
## Then it runs getarch and getarch_2nd for even more environment information.
|
||||
## Finally it builds gen_config_h for use at build time to generate config.h.
|
||||
|
||||
# CMake vars set by this file:
|
||||
# CORE
|
||||
@@ -71,16 +72,26 @@ if (MSVC)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
endif()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
# disable WindowsStore strict CRT checks
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
|
||||
endif ()
|
||||
|
||||
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
|
||||
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
message(STATUS "Running getarch")
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
@@ -96,12 +107,18 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
|
||||
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
||||
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH2_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT)
|
||||
@@ -111,3 +128,21 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE
|
||||
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT})
|
||||
ParseGetArchVars(${GETARCH2_MAKE_OUT})
|
||||
|
||||
# compile get_config_h
|
||||
set(GEN_CONFIG_H_DIR "${PROJECT_BINARY_DIR}/genconfig_h_build")
|
||||
set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
|
||||
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
|
||||
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GEN_CONFIG_H_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
@@ -22,7 +22,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER")
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
endif ()
|
||||
@@ -312,6 +312,8 @@ endif ()
|
||||
|
||||
set(AWK awk)
|
||||
|
||||
set(SED sed)
|
||||
|
||||
set(REVISION "-r${OpenBLAS_VERSION}")
|
||||
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
|
||||
|
||||
|
||||
22
common.h
22
common.h
@@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) 0
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
@@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
|
||||
typedef char* env_var_t;
|
||||
#define readenv(p, n) ((p)=getenv(n))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
|
||||
#ifdef _POSIX_MONOTONIC_CLOCK
|
||||
@@ -556,8 +561,13 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
#endif
|
||||
|
||||
#if defined(C_PGI) || defined(C_SUN)
|
||||
#define CREAL(X) (*((FLOAT *)&X + 0))
|
||||
#define CIMAG(X) (*((FLOAT *)&X + 1))
|
||||
#if defined(__STDC_IEC_559_COMPLEX__)
|
||||
#define CREAL(X) creal(X)
|
||||
#define CIMAG(X) cimag(X)
|
||||
#else
|
||||
#define CREAL(X) (*((FLOAT *)&X + 0))
|
||||
#define CIMAG(X) (*((FLOAT *)&X + 1))
|
||||
#endif
|
||||
#else
|
||||
#ifdef OPENBLAS_COMPLEX_STRUCT
|
||||
#define CREAL(Z) ((Z).real)
|
||||
@@ -649,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
|
||||
*address = 0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
/* OS_WINDOWSSTORE variant: the argument is ignored and 0 is always returned. */
static __inline int readenv_atoi(char *env) {
  (void)env; /* intentionally unused in this variant */
  return 0;
}
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
static __inline int readenv_atoi(char *env) {
|
||||
env_var_t p;
|
||||
@@ -664,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
|
||||
return(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
|
||||
|
||||
|
||||
@@ -111,11 +111,6 @@ REALNAME:
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#ifdef __ARM_PCS
|
||||
//-mfloat-abi=softfp
|
||||
#define SOFT_FLOAT_ABI
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -39,7 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
#define RETURN_BY_COMPLEX
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifndef COMMON_MIPS
|
||||
#define COMMON_MIPS
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
@@ -42,11 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static void INLINE blas_lock(volatile unsigned long *address){
|
||||
|
||||
}
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
|
||||
|
||||
@@ -71,35 +71,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifndef COMMON_MIPS64
|
||||
#define COMMON_MIPS64
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static void INLINE blas_lock(volatile unsigned long *address){
|
||||
|
||||
long int ret, val = 1;
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
||||
__asm__ __volatile__(
|
||||
"1: ll %0, %3\n"
|
||||
" ori %2, %0, 1\n"
|
||||
" sc %2, %1\n"
|
||||
" beqz %2, 1b\n"
|
||||
" andi %2, %0, 1\n"
|
||||
" sync\n"
|
||||
: "=&r" (val), "=m" (address), "=&r" (ret)
|
||||
: "m" (address)
|
||||
: "memory");
|
||||
|
||||
} while (ret);
|
||||
}
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
|
||||
|
||||
@@ -245,6 +245,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
#define RETURN_BY_STACK
|
||||
#endif
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#define RETURN_BY_STACK
|
||||
#endif
|
||||
|
||||
#ifdef F_INTERFACE_PGI
|
||||
#define RETURN_BY_STACK
|
||||
#endif
|
||||
|
||||
2
cpuid.h
2
cpuid.h
@@ -114,6 +114,7 @@
|
||||
#define CORE_HASWELL 24
|
||||
#define CORE_STEAMROLLER 25
|
||||
#define CORE_EXCAVATOR 26
|
||||
#define CORE_ZEN 27
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
@@ -209,5 +210,6 @@ typedef struct {
|
||||
#define CPUTYPE_HASWELL 48
|
||||
#define CPUTYPE_STEAMROLLER 49
|
||||
#define CPUTYPE_EXCAVATOR 50
|
||||
#define CPUTYPE_ZEN 51
|
||||
|
||||
#endif
|
||||
|
||||
54
cpuid_x86.c
54
cpuid_x86.c
@@ -1279,8 +1279,11 @@ int get_cpuname(void){
|
||||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 3:
|
||||
case 7:
|
||||
case 10:
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 5:
|
||||
return CPUTYPE_BOBCAT;
|
||||
case 6:
|
||||
switch (model) {
|
||||
case 1:
|
||||
@@ -1295,12 +1298,13 @@ int get_cpuname(void){
|
||||
return CPUTYPE_PILEDRIVER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 5: // New EXCAVATOR CPUS
|
||||
if(support_avx())
|
||||
case 5: // New EXCAVATOR CPUS
|
||||
if(support_avx())
|
||||
return CPUTYPE_EXCAVATOR;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 0:
|
||||
case 8:
|
||||
switch(exmodel){
|
||||
case 1: //AMD Trinity
|
||||
if(support_avx())
|
||||
@@ -1322,8 +1326,19 @@ int get_cpuname(void){
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
return CPUTYPE_BOBCAT;
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1450,6 +1465,7 @@ static char *cpuname[] = {
|
||||
"HASWELL",
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
@@ -1503,6 +1519,7 @@ static char *lowercpuname[] = {
|
||||
"haswell",
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
@@ -1533,6 +1550,7 @@ static char *corename[] = {
|
||||
"HASWELL",
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
@@ -1563,6 +1581,7 @@ static char *corename_lower[] = {
|
||||
"haswell",
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
};
|
||||
|
||||
|
||||
@@ -1776,15 +1795,16 @@ int get_coretype(void){
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14) // Kaby Lake
|
||||
if (model == 14) { // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -1820,6 +1840,7 @@ int get_coretype(void){
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
case 0:
|
||||
case 8:
|
||||
switch(exmodel){
|
||||
case 1: //AMD Trinity
|
||||
if(support_avx())
|
||||
@@ -1841,9 +1862,22 @@ int get_coretype(void){
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
}else return CORE_BARCELONA;
|
||||
} else if (exfamily == 8) {
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CORE_BARCELONA;
|
||||
}
|
||||
} else {
|
||||
return CORE_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -42,9 +42,27 @@ static char *cpuname_lower[] = {
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
// return CPU_GENERIC;
|
||||
return CPU_Z13;
|
||||
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Type", buffer, 4)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
|
||||
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
|
||||
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
@@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||
#endif
|
||||
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
blas_queue_t queue[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
|
||||
|
||||
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
||||
@@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
||||
@@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
||||
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
||||
@@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)i;
|
||||
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
|
||||
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
@@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)(arg -> n - i);
|
||||
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
|
||||
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ if (SMP)
|
||||
set(BLAS_SERVER blas_server_omp.c)
|
||||
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
set(BLAS_SERVER blas_server_win32.c)
|
||||
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
|
||||
set(BLAS_SERVER blas_server_win32.c)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED BLAS_SERVER)
|
||||
|
||||
@@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
SetEvent(pool.killed);
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
TerminateThread(blas_threads[i],0);
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
#endif
|
||||
}
|
||||
|
||||
blas_server_avail = 0;
|
||||
|
||||
@@ -70,8 +70,10 @@ extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
@@ -81,6 +83,7 @@ extern gotoblas_t gotoblas_HASWELL;
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
|
||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
|
||||
@@ -355,14 +358,14 @@ static gotoblas_t *get_coretype(void){
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 5){
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0){
|
||||
}else if(model == 5){
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0 || model == 8){
|
||||
if (exmodel == 1) {
|
||||
//AMD Trinity
|
||||
if(support_avx())
|
||||
@@ -389,9 +392,16 @@ static gotoblas_t *get_coretype(void){
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
}
|
||||
@@ -431,6 +441,7 @@ static char *corename[] = {
|
||||
"Haswell",
|
||||
"Steamroller",
|
||||
"Excavator",
|
||||
"Zen"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
@@ -457,6 +468,7 @@ char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
|
||||
return corename[0];
|
||||
}
|
||||
@@ -469,7 +481,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 22; i++)
|
||||
for ( i=1 ; i <= 23; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
@@ -487,6 +499,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
case 20: return (&gotoblas_HASWELL);
|
||||
|
||||
@@ -354,6 +354,24 @@ static int numa_check(void) {
|
||||
return common -> num_nodes;
|
||||
}
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
/*
 * Fallback sched_getcpu(), compiled only when __GLIBC_PREREQ(2, 6) is false.
 *
 * Reads /proc/self/stat and returns the integer found in its 39th field,
 * or -1 if the file cannot be opened or parsed.
 *
 * The second field is the command name wrapped in parentheses and may itself
 * contain blanks or ')' characters, so fields are counted from the LAST ')'
 * in the record instead of by whitespace-separated tokens from the start.
 */
int sched_getcpu(void)
{
  char stat[2048];
  char *cursor, *after_comm;
  size_t len;
  int cpu, field;
  FILE *fp = NULL;

  if ( (fp = fopen("/proc/self/stat", "r")) == NULL)
    return -1;
  len = fread(stat, 1, sizeof(stat) - 1, fp);
  fclose (fp);
  stat[len] = '\0';

  /* Locate the end of the "(comm)" field. */
  after_comm = NULL;
  for (cursor = stat; *cursor != '\0'; cursor++) {
    if (*cursor == ')') after_comm = cursor + 1;
  }
  if (after_comm == NULL)
    return -1;

  /* Skip fields 3..38; the cursor then sits in front of field 39. */
  cursor = after_comm;
  for (field = 3; field < 39; field++) {
    while (*cursor == ' ') cursor++;
    if (*cursor == '\0') return -1;
    while (*cursor != ' ' && *cursor != '\0') cursor++;
  }

  if (sscanf(cursor, "%d", &cpu) != 1)
    return -1;
  return(cpu);
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static void numa_mapping(void) {
|
||||
|
||||
int node, cpu, core;
|
||||
@@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) {
|
||||
common -> shmid = pshmid;
|
||||
|
||||
if (common -> magic != SH_MAGIC) {
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "Shared Memory Initialization.\n");
|
||||
#endif
|
||||
@@ -830,7 +847,7 @@ void gotoblas_affinity_init(void) {
|
||||
if (common -> num_nodes > 1) numa_mapping();
|
||||
|
||||
common -> final_num_procs = 0;
|
||||
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
|
||||
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
|
||||
|
||||
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
|
||||
|
||||
|
||||
@@ -1015,7 +1015,7 @@ void *blas_memory_alloc(int procpos){
|
||||
mypos = WhereAmI();
|
||||
|
||||
position = mypos;
|
||||
while (position > NUM_BUFFERS) position >>= 1;
|
||||
while (position >= NUM_BUFFERS) position >>= 1;
|
||||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
@@ -1164,8 +1164,8 @@ void blas_memory_free(void *free_area){
|
||||
position = 0;
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
while ((memory[position].addr != free_area)
|
||||
&& (position < NUM_BUFFERS)) position++;
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
if (memory[position].addr != free_area) goto error;
|
||||
|
||||
@@ -1479,12 +1479,30 @@ static int on_process_term(void)
|
||||
#else
|
||||
#pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
#endif
|
||||
#pragma data_seg(push, old_seg)
|
||||
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg(".CRT$XLB")
|
||||
#else
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
#endif
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg()
|
||||
#else
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg(".CRT$XTU")
|
||||
#else
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
#endif
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg(pop, old_seg)
|
||||
#ifdef _WIN64
|
||||
#pragma const_seg()
|
||||
#else
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||
|
||||
@@ -167,7 +167,7 @@ int get_L2_size(void){
|
||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
@@ -251,7 +251,7 @@ int get_L2_size(void){
|
||||
void blas_set_parameter(void){
|
||||
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
||||
@@ -118,10 +118,16 @@ endif
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
INTERNALNAME = $(LIBPREFIX).so
|
||||
else
|
||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
|
||||
else
|
||||
@@ -132,13 +138,13 @@ endif
|
||||
ifneq ($(C_COMPILER), LSB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else
|
||||
#for LSB
|
||||
env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
endif
|
||||
rm -f linktest
|
||||
|
||||
3010
exports/gensymbol
3010
exports/gensymbol
File diff suppressed because it is too large
Load Diff
20
f_check
20
f_check
@@ -33,6 +33,7 @@ if ($compiler eq "") {
|
||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77",
|
||||
"flang",
|
||||
"ifort");
|
||||
|
||||
OUTER:
|
||||
@@ -78,8 +79,13 @@ if ($compiler eq "") {
|
||||
$vendor = GFORTRAN;
|
||||
$openmp = "-fopenmp";
|
||||
} else {
|
||||
$vendor = G77;
|
||||
$openmp = "";
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} else {
|
||||
$vendor = G77;
|
||||
$openmp = "";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -197,6 +203,12 @@ if ($compiler eq "") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
@@ -331,6 +343,10 @@ if ($vendor eq "INTEL"){
|
||||
$linker_a .= "-lgfortran"
|
||||
}
|
||||
|
||||
if ($vendor eq "FLANG"){
|
||||
$linker_a .= "-lflang"
|
||||
}
|
||||
|
||||
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
|
||||
open(CONFFILE, ">> $config" ) || die "Can't append $config";
|
||||
|
||||
|
||||
36
gen_config_h.c
Normal file
36
gen_config_h.c
Normal file
@@ -0,0 +1,36 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
int main(int argc, char**argv) {
|
||||
FILE *fp;
|
||||
char line[100];
|
||||
char line2[80];
|
||||
char *s;
|
||||
int i;
|
||||
|
||||
fprintf(stdout,"#ifndef OPENBLAS_CONFIG_H\n");
|
||||
fprintf(stdout,"#define OPENBLAS_CONFIG_H\n");
|
||||
fp=fopen(argv[1],"r");
|
||||
do{
|
||||
s=fgets(line,80,fp);
|
||||
if (s== NULL) break;
|
||||
memset(line2,0,80);
|
||||
i=sscanf(line,"#define %70c",line2);
|
||||
if (i!=0) {
|
||||
fprintf(stdout,"#define OPENBLAS_%s",line2);
|
||||
} else {
|
||||
fprintf(stdout,"\n");
|
||||
}
|
||||
} while (1);
|
||||
fclose(fp);
|
||||
fprintf(stdout,"#define OPENBLAS_VERSION \"OpenBLAS %s\"\n", VERSION);
|
||||
fp=fopen(argv[2],"r");
|
||||
do{
|
||||
s=fgets(line,100,fp);
|
||||
if (s== NULL) break;
|
||||
fprintf(stdout,"%s",line);
|
||||
} while(1);
|
||||
fclose(fp);
|
||||
fprintf(stdout,"#endif /* OPENBLAS_CONFIG_H */\n");
|
||||
exit(0);
|
||||
}
|
||||
19
getarch.c
19
getarch.c
@@ -473,6 +473,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "EXCAVATOR"
|
||||
#endif
|
||||
|
||||
#if defined (FORCE_ZEN)
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "ZEN"
|
||||
#define ARCHCONFIG "-DZEN " \
|
||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL2_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=16777216 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=8 " \
|
||||
"-DITB_DEFAULT_ENTRIES=64 -DITB_SIZE=4096 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
|
||||
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
|
||||
"-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "zen"
|
||||
#define CORENAME "ZEN"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_SSE_GENERIC
|
||||
#define FORCE
|
||||
|
||||
@@ -315,7 +315,7 @@ CCBLAS3OBJS = \
|
||||
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
|
||||
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
|
||||
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
|
||||
cblas_cgeadd.$(SUFFIX)
|
||||
cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX)
|
||||
|
||||
|
||||
|
||||
@@ -2137,3 +2137,5 @@ cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c
|
||||
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
22
interface/xerbla.c
Normal file
22
interface/xerbla.c
Normal file
@@ -0,0 +1,22 @@
|
||||
#ifdef CBLAS
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * CBLAS error reporter.
 *
 *   p     when non-zero, reported on stderr as the number of the bad parameter
 *   rout  routine name printed in that report
 *   form  printf-style format, printed to stderr with the variadic arguments
 *
 * Never returns: the process is terminated with exit(-1).
 */
void CNAME(blasint p, char *rout, char *form, ...)
{
  va_list args;

  va_start(args, form);

  if (p)
    /* NOTE(review): blasint is presumably wider than int in some builds --
       confirm; the cast keeps the argument matched to %d either way. */
    fprintf(stderr, "Parameter %d to routine %s was incorrect\n", (int)p, rout);
  vfprintf(stderr, form, args);
  va_end(args);
  exit(-1);
}
|
||||
#endif
|
||||
|
||||
@@ -160,9 +160,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasin
|
||||
|
||||
if (n <= 0) {
|
||||
#ifdef FORCE_USE_STACK
|
||||
//*result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
|
||||
CREAL(*result) = 0.0;
|
||||
CIMAG(*result) = 0.0;
|
||||
OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
|
||||
*result = zero;
|
||||
// CREAL(*result) = 0.0;
|
||||
// CIMAG(*result) = 0.0;
|
||||
return;
|
||||
#else
|
||||
return zero;
|
||||
|
||||
@@ -125,9 +125,8 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef NEW_IMATCOPY
|
||||
if (*lda == *ldb) {
|
||||
if (*lda == *ldb && *cols == *rows) {
|
||||
if ( order == BlasColMajor )
|
||||
{
|
||||
|
||||
@@ -180,7 +179,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
{
|
||||
printf("Memory alloc failed\n");
|
||||
printf("Memory alloc failed in zimatcopy\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -205,14 +204,14 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
if ( trans == BlasTrans )
|
||||
{
|
||||
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
{
|
||||
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
@@ -238,20 +237,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
if ( trans == BlasTrans )
|
||||
{
|
||||
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
{
|
||||
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
free(b);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ endforeach ()
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
|
||||
if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell")
|
||||
# Turn USE_TRMM on for the listed architectures, targets and cores.
# Every operand is quoted so an empty or undefined variable cannot break the
# condition. The final test expands ${CORE}; it previously compared the
# literal string "{CORE}", which can never equal "zen".
if ("${ARCH}" STREQUAL "arm" OR "${ARCH}" STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
  set(USE_TRMM true)
endif ()
|
||||
|
||||
|
||||
@@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
include $(KERNELDIR)/KERNEL.ARMV5
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
@@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
|
||||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
@@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
|
||||
CNRM2KERNEL = nrm2_vfp.S
|
||||
ZNRM2KERNEL = nrm2_vfp.S
|
||||
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal.c
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SSWAPKERNEL = swap_vfp.S
|
||||
DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
@@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
|
||||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
|
||||
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
endif
|
||||
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
|
||||
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
endif
|
||||
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
@@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
|
||||
@@ -1,91 +1,12 @@
|
||||
|
||||
#################################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
ZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
SAMINKERNEL = iamax_vfp.S
|
||||
DAMINKERNEL = iamax_vfp.S
|
||||
CAMINKERNEL = iamax_vfp.S
|
||||
ZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
SMAXKERNEL = iamax_vfp.S
|
||||
DMAXKERNEL = iamax_vfp.S
|
||||
|
||||
SMINKERNEL = iamax_vfp.S
|
||||
DMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMAXKERNEL = iamax_vfp.S
|
||||
IDAMAXKERNEL = iamax_vfp.S
|
||||
ICAMAXKERNEL = iamax_vfp.S
|
||||
IZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMINKERNEL = iamax_vfp.S
|
||||
IDAMINKERNEL = iamax_vfp.S
|
||||
ICAMINKERNEL = iamax_vfp.S
|
||||
IZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISMAXKERNEL = iamax_vfp.S
|
||||
IDMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISMINKERNEL = iamax_vfp.S
|
||||
IDMINKERNEL = iamax_vfp.S
|
||||
|
||||
SSWAPKERNEL = swap_vfp.S
|
||||
DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
ZSWAPKERNEL = swap_vfp.S
|
||||
|
||||
SASUMKERNEL = asum_vfp.S
|
||||
DASUMKERNEL = asum_vfp.S
|
||||
CASUMKERNEL = asum_vfp.S
|
||||
ZASUMKERNEL = asum_vfp.S
|
||||
|
||||
SAXPYKERNEL = axpy_vfp.S
|
||||
DAXPYKERNEL = axpy_vfp.S
|
||||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
CDOTKERNEL = cdot_vfp.S
|
||||
ZDOTKERNEL = zdot_vfp.S
|
||||
include $(KERNELDIR)/KERNEL.ARMV6
|
||||
|
||||
SNRM2KERNEL = nrm2_vfpv3.S
|
||||
DNRM2KERNEL = nrm2_vfpv3.S
|
||||
CNRM2KERNEL = nrm2_vfpv3.S
|
||||
ZNRM2KERNEL = nrm2_vfpv3.S
|
||||
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal.c
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
DGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
ZGEMVNKERNEL = zgemv_n_vfp.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t_vfp.S
|
||||
DGEMVTKERNEL = gemv_t_vfp.S
|
||||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
|
||||
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
|
||||
@@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
|
||||
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
|
||||
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
|
||||
@@ -475,6 +475,14 @@ asum_kernel_L999:
|
||||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov r0, s0
|
||||
#else
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bx lr
|
||||
|
||||
EPILOGUE
|
||||
|
||||
@@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
//hard
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
#else
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_X [fp, #0 ]
|
||||
#define OLD_INC_X [fp, #4 ]
|
||||
#define OLD_Y [fp, #8 ]
|
||||
#define OLD_INC_Y [fp, #12 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0]
|
||||
#define OLD_X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define OLD_Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#endif
|
||||
|
||||
|
||||
#else //COMPLEX
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define OLD_Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#else
|
||||
#define OLD_ALPHAR [fp, #0]
|
||||
#define OLD_ALPHAI [fp, #8]
|
||||
#define OLD_X [fp, #16 ]
|
||||
#define OLD_INC_X [fp, #20 ]
|
||||
#define OLD_Y [fp, #24 ]
|
||||
#define OLD_INC_Y [fp, #28 ]
|
||||
#endif
|
||||
|
||||
#endif //!defined(COMPLEX)
|
||||
|
||||
#else //__ARM_PCS_VFP
|
||||
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
|
||||
#endif //!defined(__ARM_PCS_VFP)
|
||||
|
||||
#define N r0
|
||||
#define Y r1
|
||||
#define INC_X r2
|
||||
@@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
@@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
@@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #8
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
#ifndef DOUBLE
|
||||
vmov s0, r3 //move alpha to s0
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr X, OLD_X
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
#else //COMPLEX
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr X, OLD_X
|
||||
#else
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
||||
@@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
#define OLD_Y r3
|
||||
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
@@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* registers
|
||||
*******************************************************/
|
||||
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_RETURN_ADDR r0
|
||||
#define OLD_N r1
|
||||
#define OLD_X r2
|
||||
#define OLD_INC_X r3
|
||||
#define OLD_Y [fp, #0 ]
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#define RETURN_ADDR r8
|
||||
#else
|
||||
#define OLD_Y r3
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
#endif
|
||||
|
||||
#define I r5
|
||||
#define Y r6
|
||||
@@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.align 5
|
||||
|
||||
push {r4 - r9, fp}
|
||||
add fp, sp, #24
|
||||
add fp, sp, #28
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
sub r4, fp, #128
|
||||
@@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmov s2, s0
|
||||
vmov s3, s0
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
mov RETURN_ADDR, OLD_RETURN_ADDR
|
||||
mov N, OLD_N
|
||||
mov X, OLD_X
|
||||
mov INC_X, OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#else
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble cdot_kernel_L999
|
||||
@@ -265,7 +283,6 @@ cdot_kernel_S10:
|
||||
|
||||
|
||||
cdot_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { s8 - s15} // restore floating point registers
|
||||
|
||||
@@ -276,8 +293,11 @@ cdot_kernel_L999:
|
||||
vadd.f32 s0 , s0, s2
|
||||
vsub.f32 s1 , s1, s3
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vstm RETURN_ADDR, {s0 - s1}
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
|
||||
@@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
@@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R1 vmls.f32
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
@@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
@@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R1 vmls.f32
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#endif
|
||||
|
||||
@@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define OLD_LDA [fp, #8 ]
|
||||
#define X [fp, #12 ]
|
||||
#define OLD_INC_X [fp, #16 ]
|
||||
#define Y [fp, #20 ]
|
||||
#define OLD_INC_Y [fp, #24 ]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
@@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
@@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp N, #0
|
||||
ble cgemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
vstr s0 , ALPHA_R
|
||||
|
||||
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define OLD_LDA [fp, #8 ]
|
||||
#define X [fp, #12 ]
|
||||
#define OLD_INC_X [fp, #16 ]
|
||||
#define Y [fp, #20 ]
|
||||
#define OLD_INC_Y [fp, #24 ]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
@@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
@@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp OLD_N, #0
|
||||
ble cgemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
||||
@@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#define OFFSET [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
@@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#define OFFSET [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmuls
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R1 vnmul.f32
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmuls
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
@@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define FMAC_R1 fmuls
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmuls
|
||||
#define FMAC_I1 vnmul.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
@@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmuls
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmuls
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
@@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmuls
|
||||
#define FMAC_R1 vnmul.f32
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmuls
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I1 vnmul.f32
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#endif
|
||||
|
||||
@@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -246,6 +246,9 @@ ddot_kernel_L999:
|
||||
vldm r3, { d8 - d15} // restore floating point registers
|
||||
|
||||
vadd.f64 d0 , d0, d1 // set return value
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
sub sp, fp, #24
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
@@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define OLD_C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#define OFFSET [fp, #28 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define OLD_C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#define OFFSET [fp, #28 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
@@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp N, #0
|
||||
ble gemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
|
||||
|
||||
@@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
//hard
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#define OLD_A r3
|
||||
#else
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
#define AO1 r0
|
||||
@@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp N, #0
|
||||
ble gemvn_kernel_L999
|
||||
|
||||
#ifndef DOUBLE
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
|
||||
|
||||
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
||||
|
||||
@@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
//hard abi
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#define OLD_A r3
|
||||
#else
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
#define M r0
|
||||
@@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp OLD_N, #0
|
||||
ble gemvt_kernel_L999
|
||||
|
||||
#ifndef DOUBLE
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
|
||||
@@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
@@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp OLD_N, #0
|
||||
ble gemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
||||
@@ -573,6 +573,13 @@ nrm2_kernel_L999:
|
||||
#else
|
||||
vsqrt.f32 s1, s1
|
||||
vmul.f32 s0, s0, s1
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov r0, s0
|
||||
#else
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bx lr
|
||||
|
||||
@@ -503,8 +503,13 @@ nrm2_kernel_L999:
|
||||
#else
|
||||
vsqrt.f32 s1, s1
|
||||
vmul.f32 s0, s0, s1
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov r0, s0
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if defined(DOUBLE)
|
||||
vmov r0, r1, d0
|
||||
#else
|
||||
vmov r0, s0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_C [fp, #4]
|
||||
#define OLD_S [fp, #8]
|
||||
#else
|
||||
#define OLD_C [fp, #8]
|
||||
#define OLD_S [fp, #16]
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define N r0
|
||||
#define X r1
|
||||
@@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X, { d2 }
|
||||
fstmiad Y, { d3 }
|
||||
|
||||
@@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X, { s2 }
|
||||
fstmias Y, { s3 }
|
||||
|
||||
@@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
@@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
vstr d2 , [ X, #0 ]
|
||||
vstr d3 , [ Y, #0 ]
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
vstr d2 , [ X, #8 ]
|
||||
vstr d3 , [ Y, #8 ]
|
||||
|
||||
@@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
@@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
vstr s2 , [ X, #0 ]
|
||||
vstr s3 , [ Y, #0 ]
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
vstr s2 , [ X, #4 ]
|
||||
vstr s3 , [ Y, #4 ]
|
||||
|
||||
@@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #8
|
||||
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vldr s0, OLD_C
|
||||
vldr s1, OLD_S
|
||||
#else
|
||||
vldr d0, OLD_C
|
||||
vldr d1, OLD_S
|
||||
#endif
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble rot_kernel_L999
|
||||
|
||||
@@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
@@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
@@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
@@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X, { d2 - d3 }
|
||||
@@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
@@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
@@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X, { s2 - s3 }
|
||||
|
||||
@@ -329,20 +329,19 @@ sdot_kernel_L999:
|
||||
vldm r3, { s8 - s15} // restore floating point registers
|
||||
|
||||
#if defined(DSDOT)
|
||||
|
||||
vadd.f64 d0 , d0, d1 // set return value
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov r0, r1, d0
|
||||
#else
|
||||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if defined(DSDOT)
|
||||
vmov r0, r1, d0
|
||||
#else
|
||||
|
||||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov r0, s0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
@@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define OLD_M r0
|
||||
#define OLD_N r1
|
||||
#define OLD_K r2
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
#define OLD_ALPHA r3
|
||||
//#define OLD_A
|
||||
#else //hard
|
||||
#define OLD_A r3
|
||||
#define OLD_ALPHA s0
|
||||
#endif
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
@@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define M [fp, #-256 ]
|
||||
#define N [fp, #-260 ]
|
||||
#define K [fp, #-264 ]
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
#define A [fp, #-268 ]
|
||||
#endif
|
||||
|
||||
#define FP_ZERO [fp, #-240]
|
||||
#define FP_ZERO_0 [fp, #-240]
|
||||
@@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
#define A [fp, #4 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#else //hard
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
#define L r2
|
||||
@@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
str OLD_ALPHA, ALPHA
|
||||
#else //hard
|
||||
str OLD_A, A
|
||||
vstr OLD_ALPHA, ALPHA
|
||||
#endif
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s31} // store floating point registers
|
||||
|
||||
|
||||
@@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define OLD_C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#define OFFSET [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#define OFFSET [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_X [fp, #0 ]
|
||||
#define OLD_INC_X [fp, #4 ]
|
||||
#define OLD_Y [fp, #8 ]
|
||||
#define OLD_INC_Y [fp, #12 ]
|
||||
#else
|
||||
#define OLD_X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12]
|
||||
#define OLD_Y [fp, #16]
|
||||
#define OLD_INC_Y [fp, #20]
|
||||
#endif
|
||||
|
||||
#else //COMPLEX
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define OLD_Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#else
|
||||
#define OLD_X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define OLD_Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#endif // !defined(__ARM_PCS_VFP)
|
||||
|
||||
#else
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
#endif
|
||||
|
||||
|
||||
#define N r0
|
||||
@@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
push {r4 , fp}
|
||||
add fp, sp, #8
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
||||
@@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
#define OLD_Y r3
|
||||
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
@@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* registers
|
||||
*******************************************************/
|
||||
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_RETURN_ADDR r0
|
||||
#define OLD_N r1
|
||||
#define OLD_X r2
|
||||
#define OLD_INC_X r3
|
||||
#define OLD_Y [fp, #0 ]
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#define RETURN_ADDR r8
|
||||
#else
|
||||
#define OLD_Y r3
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
#endif
|
||||
|
||||
#define I r5
|
||||
#define Y r6
|
||||
@@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.align 5
|
||||
|
||||
push {r4 - r9, fp}
|
||||
add fp, sp, #24
|
||||
add fp, sp, #28
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
sub r4, fp, #128
|
||||
@@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vcvt.f64.f32 d2, s0
|
||||
vcvt.f64.f32 d3, s0
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
mov RETURN_ADDR, OLD_RETURN_ADDR
|
||||
mov N, OLD_N
|
||||
mov X, OLD_X
|
||||
mov INC_X, OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#else
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble zdot_kernel_L999
|
||||
@@ -280,8 +297,11 @@ zdot_kernel_L999:
|
||||
vadd.f64 d0 , d0, d2
|
||||
vsub.f64 d1 , d1, d3
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vstm RETURN_ADDR, {d0 - d1}
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
|
||||
@@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
@@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R1 vmls.f64
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
@@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_I fsubd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
@@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmacd
|
||||
#define FMAC_R1 vmls.f64
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#endif
|
||||
|
||||
@@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR [fp, #0 ]
|
||||
#define OLD_ALPHAI [fp, #8 ]
|
||||
#define OLD_A_SOFTFP [fp, #16]
|
||||
#define OLD_LDA [fp, #20]
|
||||
#define X [fp, #24]
|
||||
#define OLD_INC_X [fp, #28]
|
||||
#define Y [fp, #32]
|
||||
#define OLD_INC_Y [fp, #36]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
@@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
@@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp N, #0
|
||||
ble zgemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
vstr d0 , ALPHA_R
|
||||
|
||||
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR [fp, #0 ]
|
||||
#define OLD_ALPHAI [fp, #8 ]
|
||||
#define OLD_A_SOFTFP [fp, #16]
|
||||
#define OLD_LDA [fp, #20]
|
||||
#define X [fp, #24]
|
||||
#define OLD_INC_X [fp, #28]
|
||||
#define Y [fp, #32]
|
||||
#define OLD_INC_Y [fp, #36]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
@@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
@@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp OLD_N, #0
|
||||
ble zgemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
||||
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#define OFFSET [fp, #36 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
@@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#define OFFSET [fp, #36 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
@@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmuld
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R1 vnmul.f64
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmuld
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
@@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define FMAC_R1 fmuld
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmuld
|
||||
#define FMAC_I1 vnmul.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
@@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_I fsubd
|
||||
|
||||
#define FMAC_R1 fmuld
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmuld
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
@@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmuld
|
||||
#define FMAC_R1 vnmul.f64
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmuld
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I1 vnmul.f64
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#endif
|
||||
|
||||
@@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
@@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
@@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
|
||||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
@@ -146,7 +146,7 @@ static float casum_kernel_16 (long n, float *x)
|
||||
"xxsldwi 33, 32, 32, 1 \n\t"
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
|
||||
"xscvspdp %0, 32 \n"
|
||||
"xscvspdp %x0, 32 \n"
|
||||
|
||||
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
||||
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
|
||||
|
||||
@@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %5, %2 \n\t"
|
||||
"lxvw4x 34, %6, %2 \n\t"
|
||||
"lxvw4x 35, %7, %2 \n\t"
|
||||
"lxvw4x 36, %8, %2 \n\t"
|
||||
"lxvw4x 37, %9, %2 \n\t"
|
||||
"lxvw4x 38, %10, %2 \n\t"
|
||||
"lxvw4x 39, %11, %2 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %5, %2 \n\t"
|
||||
"lxvd2x 34, %6, %2 \n\t"
|
||||
"lxvd2x 35, %7, %2 \n\t"
|
||||
"lxvd2x 36, %8, %2 \n\t"
|
||||
"lxvd2x 37, %9, %2 \n\t"
|
||||
"lxvd2x 38, %10, %2 \n\t"
|
||||
"lxvd2x 39, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
@@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %5, %2 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"lxvw4x 34, %6, %2 \n\t"
|
||||
"lxvw4x 35, %7, %2 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"lxvw4x 36, %8, %2 \n\t"
|
||||
"lxvw4x 37, %9, %2 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"lxvw4x 38, %10, %2 \n\t"
|
||||
"lxvw4x 39, %11, %2 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %5, %2 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"lxvd2x 34, %6, %2 \n\t"
|
||||
"lxvd2x 35, %7, %2 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"lxvd2x 36, %8, %2 \n\t"
|
||||
"lxvd2x 37, %9, %2 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
"lxvd2x 38, %10, %2 \n\t"
|
||||
"lxvd2x 39, %11, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
@@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||
:
|
||||
|
||||
@@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %4 \n\t"
|
||||
"lxvw4x 33, %5, %4 \n\t"
|
||||
"lxvw4x 34, %6, %4 \n\t"
|
||||
"lxvw4x 35, %7, %4 \n\t"
|
||||
"lxvw4x 36, %8, %4 \n\t"
|
||||
"lxvw4x 37, %9, %4 \n\t"
|
||||
"lxvw4x 38, %10, %4 \n\t"
|
||||
"lxvw4x 39, %11, %4 \n\t"
|
||||
"lxvd2x 32, 0, %4 \n\t"
|
||||
"lxvd2x 33, %5, %4 \n\t"
|
||||
"lxvd2x 34, %6, %4 \n\t"
|
||||
"lxvd2x 35, %7, %4 \n\t"
|
||||
"lxvd2x 36, %8, %4 \n\t"
|
||||
"lxvd2x 37, %9, %4 \n\t"
|
||||
"lxvd2x 38, %10, %4 \n\t"
|
||||
"lxvd2x 39, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %4 \n\t"
|
||||
"lxvw4x 41, %5, %4 \n\t"
|
||||
"lxvw4x 42, %6, %4 \n\t"
|
||||
"lxvw4x 43, %7, %4 \n\t"
|
||||
"lxvw4x 44, %8, %4 \n\t"
|
||||
"lxvw4x 45, %9, %4 \n\t"
|
||||
"lxvw4x 46, %10, %4 \n\t"
|
||||
"lxvw4x 47, %11, %4 \n\t"
|
||||
"lxvd2x 40, 0, %4 \n\t"
|
||||
"lxvd2x 41, %5, %4 \n\t"
|
||||
"lxvd2x 42, %6, %4 \n\t"
|
||||
"lxvd2x 43, %7, %4 \n\t"
|
||||
"lxvd2x 44, %8, %4 \n\t"
|
||||
"lxvd2x 45, %9, %4 \n\t"
|
||||
"lxvd2x 46, %10, %4 \n\t"
|
||||
"lxvd2x 47, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, -128 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %3 \n\t"
|
||||
"lxvw4x 49, %5, %3 \n\t"
|
||||
"lxvw4x 50, %6, %3 \n\t"
|
||||
"lxvw4x 51, %7, %3 \n\t"
|
||||
"lxvw4x 0, %8, %3 \n\t"
|
||||
"lxvw4x 1, %9, %3 \n\t"
|
||||
"lxvw4x 2, %10, %3 \n\t"
|
||||
"lxvw4x 3, %11, %3 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 49, %5, %3 \n\t"
|
||||
"lxvd2x 50, %6, %3 \n\t"
|
||||
"lxvd2x 51, %7, %3 \n\t"
|
||||
"lxvd2x 0, %8, %3 \n\t"
|
||||
"lxvd2x 1, %9, %3 \n\t"
|
||||
"lxvd2x 2, %10, %3 \n\t"
|
||||
"lxvd2x 3, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"lxvw4x 4, 0, %3 \n\t"
|
||||
"lxvw4x 5, %5, %3 \n\t"
|
||||
"lxvw4x 6, %6, %3 \n\t"
|
||||
"lxvw4x 7, %7, %3 \n\t"
|
||||
"lxvw4x 8, %8, %3 \n\t"
|
||||
"lxvw4x 9, %9, %3 \n\t"
|
||||
"lxvw4x 10, %10, %3 \n\t"
|
||||
"lxvw4x 11, %11, %3 \n\t"
|
||||
"lxvd2x 4, 0, %3 \n\t"
|
||||
"lxvd2x 5, %5, %3 \n\t"
|
||||
"lxvd2x 6, %6, %3 \n\t"
|
||||
"lxvd2x 7, %7, %3 \n\t"
|
||||
"lxvd2x 8, %8, %3 \n\t"
|
||||
"lxvd2x 9, %9, %3 \n\t"
|
||||
"lxvd2x 10, %10, %3 \n\t"
|
||||
"lxvd2x 11, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, -128 \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 48, 0, %4 \n\t"
|
||||
"stxvw4x 49, %5, %4 \n\t"
|
||||
"stxvw4x 50, %6, %4 \n\t"
|
||||
"stxvw4x 51, %7, %4 \n\t"
|
||||
"stxvw4x 0, %8, %4 \n\t"
|
||||
"stxvw4x 1, %9, %4 \n\t"
|
||||
"stxvw4x 2, %10, %4 \n\t"
|
||||
"stxvw4x 3, %11, %4 \n\t"
|
||||
"stxvd2x 48, 0, %4 \n\t"
|
||||
"stxvd2x 49, %5, %4 \n\t"
|
||||
"stxvd2x 50, %6, %4 \n\t"
|
||||
"stxvd2x 51, %7, %4 \n\t"
|
||||
"stxvd2x 0, %8, %4 \n\t"
|
||||
"stxvd2x 1, %9, %4 \n\t"
|
||||
"stxvd2x 2, %10, %4 \n\t"
|
||||
"stxvd2x 3, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"stxvw4x 4, 0, %4 \n\t"
|
||||
"stxvw4x 5, %5, %4 \n\t"
|
||||
"stxvw4x 6, %6, %4 \n\t"
|
||||
"stxvw4x 7, %7, %4 \n\t"
|
||||
"stxvw4x 8, %8, %4 \n\t"
|
||||
"stxvw4x 9, %9, %4 \n\t"
|
||||
"stxvw4x 10, %10, %4 \n\t"
|
||||
"stxvw4x 11, %11, %4 \n\t"
|
||||
"stxvd2x 4, 0, %4 \n\t"
|
||||
"stxvd2x 5, %5, %4 \n\t"
|
||||
"stxvd2x 6, %6, %4 \n\t"
|
||||
"stxvd2x 7, %7, %4 \n\t"
|
||||
"stxvd2x 8, %8, %4 \n\t"
|
||||
"stxvd2x 9, %9, %4 \n\t"
|
||||
"stxvd2x 10, %10, %4 \n\t"
|
||||
"stxvd2x 11, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
|
||||
@@ -44,16 +44,16 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
|
||||
__asm__
|
||||
(
|
||||
"lxvd2x 34, 0, %9 \n\t" // x0, x1
|
||||
"lxvd2x 35, %10, %9 \n\t" // x2, x3
|
||||
"xxspltd 32, %x8, 0 \n\t" // alpha, alpha
|
||||
"lxvd2x 34, 0, %10 \n\t" // x0, x1
|
||||
"lxvd2x 35, %11, %10 \n\t" // x2, x3
|
||||
"xxspltd 32, %x9, 0 \n\t" // alpha, alpha
|
||||
|
||||
"sldi %6, %4, 3 \n\t" // lda * sizeof (double)
|
||||
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
||||
|
||||
"xvmuldp 34, 34, 32 \n\t" // x0 * alpha, x1 * alpha
|
||||
"xvmuldp 35, 35, 32 \n\t" // x2 * alpha, x3 * alpha
|
||||
|
||||
"add %4, %3, %6 \n\t" // a1 = a0 + lda
|
||||
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
||||
"add %6, %6, %6 \n\t" // 2 * lda
|
||||
|
||||
"xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
|
||||
@@ -70,16 +70,16 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
"dcbt 0, %6 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
|
||||
"lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
|
||||
"lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3]
|
||||
|
||||
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
|
||||
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
|
||||
"lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3]
|
||||
|
||||
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
|
||||
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
|
||||
"lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3]
|
||||
|
||||
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
|
||||
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
|
||||
"lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3]
|
||||
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
||||
@@ -95,37 +95,37 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
"1: \n\t"
|
||||
|
||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"lxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
|
||||
"lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
|
||||
"lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
|
||||
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
|
||||
"lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
|
||||
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
|
||||
"lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"stxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"stxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
|
||||
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
|
||||
"lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
@@ -135,37 +135,37 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
|
||||
|
||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"lxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
|
||||
"lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
|
||||
"lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
|
||||
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
|
||||
"lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
|
||||
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
|
||||
"lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"stxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"stxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
|
||||
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
|
||||
"lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
@@ -175,37 +175,37 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
|
||||
|
||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"lxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
|
||||
"lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
|
||||
"lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
|
||||
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
|
||||
"lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
|
||||
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
|
||||
"lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"stxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"stxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
|
||||
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
|
||||
"lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
@@ -215,37 +215,37 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
|
||||
|
||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"lxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
|
||||
"lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
|
||||
"lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
|
||||
"lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
|
||||
"lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
|
||||
"lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
|
||||
"lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"stxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"stxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
|
||||
"lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
|
||||
"lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
@@ -256,7 +256,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
"2: \n\t"
|
||||
|
||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"lxvd2x 37, %10, %2 \n\t" // y2, y3
|
||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
@@ -271,12 +271,12 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||
"stxvd2x 37, %10, %2 \n" // y2, y3
|
||||
"stxvd2x 37, %11, %2 \n" // y2, y3
|
||||
|
||||
"#n=%1 ap=%11 lda=%12 x=%7=%9 y=%0=%2 alpha=%8 o16=%10\n"
|
||||
"#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n"
|
||||
"#a0=%3 a1=%4 a2=%5 a3=%6"
|
||||
:
|
||||
"=m" (*y),
|
||||
"+m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (y), // 2
|
||||
"=b" (a0), // 3
|
||||
@@ -285,11 +285,12 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||
"=&b" (a3) // 6
|
||||
:
|
||||
"m" (*x),
|
||||
"d" (alpha), // 8
|
||||
"r" (x), // 9
|
||||
"b" (16), // 10
|
||||
"3" (ap), // 11
|
||||
"4" (lda) // 12
|
||||
"m" (*ap),
|
||||
"d" (alpha), // 9
|
||||
"r" (x), // 10
|
||||
"b" (16), // 11
|
||||
"3" (ap), // 12
|
||||
"4" (lda) // 13
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37",
|
||||
|
||||
@@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
@@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
|
||||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
@@ -146,7 +146,7 @@ static float sasum_kernel_32 (long n, float *x)
|
||||
"xxsldwi 33, 32, 32, 1 \n\t"
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
|
||||
"xscvspdp %0, 32 \n"
|
||||
"xscvspdp %x0, 32 \n"
|
||||
|
||||
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
||||
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
|
||||
|
||||
@@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
@@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
@@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||
:
|
||||
|
||||
@@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 48, 0, %3 \n\t"
|
||||
"lxvw4x 41, %10, %2 \n\t"
|
||||
"lxvw4x 49, %10, %3 \n\t"
|
||||
"lxvw4x 42, %11, %2 \n\t"
|
||||
"lxvw4x 50, %11, %3 \n\t"
|
||||
"lxvw4x 43, %12, %2 \n\t"
|
||||
"lxvw4x 51, %12, %3 \n\t"
|
||||
"lxvw4x 44, %13, %2 \n\t"
|
||||
"lxvw4x %x4, %13, %3 \n\t"
|
||||
"lxvw4x 45, %14, %2 \n\t"
|
||||
"lxvw4x %x5, %14, %3 \n\t"
|
||||
"lxvw4x 46, %15, %2 \n\t"
|
||||
"lxvw4x %x6, %15, %3 \n\t"
|
||||
"lxvw4x 47, %16, %2 \n\t"
|
||||
"lxvw4x %x7, %16, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 41, %10, %2 \n\t"
|
||||
"lxvd2x 49, %10, %3 \n\t"
|
||||
"lxvd2x 42, %11, %2 \n\t"
|
||||
"lxvd2x 50, %11, %3 \n\t"
|
||||
"lxvd2x 43, %12, %2 \n\t"
|
||||
"lxvd2x 51, %12, %3 \n\t"
|
||||
"lxvd2x 44, %13, %2 \n\t"
|
||||
"lxvd2x %x4, %13, %3 \n\t"
|
||||
"lxvd2x 45, %14, %2 \n\t"
|
||||
"lxvd2x %x5, %14, %3 \n\t"
|
||||
"lxvd2x 46, %15, %2 \n\t"
|
||||
"lxvd2x %x6, %15, %3 \n\t"
|
||||
"lxvd2x 47, %16, %2 \n\t"
|
||||
"lxvd2x %x7, %16, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
@@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
||||
"1: \n\t"
|
||||
|
||||
"xvmaddasp 32, 40, 48 \n\t"
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 48, 0, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"xvmaddasp 33, 41, 49 \n\t"
|
||||
"lxvw4x 41, %10, %2 \n\t"
|
||||
"lxvw4x 49, %10, %3 \n\t"
|
||||
"lxvd2x 41, %10, %2 \n\t"
|
||||
"lxvd2x 49, %10, %3 \n\t"
|
||||
"xvmaddasp 34, 42, 50 \n\t"
|
||||
"lxvw4x 42, %11, %2 \n\t"
|
||||
"lxvw4x 50, %11, %3 \n\t"
|
||||
"lxvd2x 42, %11, %2 \n\t"
|
||||
"lxvd2x 50, %11, %3 \n\t"
|
||||
"xvmaddasp 35, 43, 51 \n\t"
|
||||
"lxvw4x 43, %12, %2 \n\t"
|
||||
"lxvw4x 51, %12, %3 \n\t"
|
||||
"lxvd2x 43, %12, %2 \n\t"
|
||||
"lxvd2x 51, %12, %3 \n\t"
|
||||
"xvmaddasp 36, 44, %x4 \n\t"
|
||||
"lxvw4x 44, %13, %2 \n\t"
|
||||
"lxvw4x %x4, %13, %3 \n\t"
|
||||
"lxvd2x 44, %13, %2 \n\t"
|
||||
"lxvd2x %x4, %13, %3 \n\t"
|
||||
"xvmaddasp 37, 45, %x5 \n\t"
|
||||
"lxvw4x 45, %14, %2 \n\t"
|
||||
"lxvw4x %x5, %14, %3 \n\t"
|
||||
"lxvd2x 45, %14, %2 \n\t"
|
||||
"lxvd2x %x5, %14, %3 \n\t"
|
||||
"xvmaddasp 38, 46, %x6 \n\t"
|
||||
"lxvw4x 46, %15, %2 \n\t"
|
||||
"lxvw4x %x6, %15, %3 \n\t"
|
||||
"lxvd2x 46, %15, %2 \n\t"
|
||||
"lxvd2x %x6, %15, %3 \n\t"
|
||||
"xvmaddasp 39, 47, %x7 \n\t"
|
||||
"lxvw4x 47, %16, %2 \n\t"
|
||||
"lxvw4x %x7, %16, %3 \n\t"
|
||||
"lxvd2x 47, %16, %2 \n\t"
|
||||
"lxvd2x %x7, %16, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
@@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||
"xscvdpspn 37, %x14 \n\t" // load s to all words
|
||||
"xxspltw 37, 37, 0 \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %3 \n\t" // load x
|
||||
"lxvw4x 33, %15, %3 \n\t"
|
||||
"lxvw4x 34, %16, %3 \n\t"
|
||||
"lxvw4x 35, %17, %3 \n\t"
|
||||
"lxvd2x 32, 0, %3 \n\t" // load x
|
||||
"lxvd2x 33, %15, %3 \n\t"
|
||||
"lxvd2x 34, %16, %3 \n\t"
|
||||
"lxvd2x 35, %17, %3 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %4 \n\t" // load y
|
||||
"lxvw4x 49, %15, %4 \n\t"
|
||||
"lxvw4x 50, %16, %4 \n\t"
|
||||
"lxvw4x 51, %17, %4 \n\t"
|
||||
"lxvd2x 48, 0, %4 \n\t" // load y
|
||||
"lxvd2x 49, %15, %4 \n\t"
|
||||
"lxvd2x 50, %16, %4 \n\t"
|
||||
"lxvd2x 51, %17, %4 \n\t"
|
||||
|
||||
"addi %3, %3, 64 \n\t"
|
||||
"addi %4, %4, 64 \n\t"
|
||||
@@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||
"xvmulsp 44, 32, 37 \n\t" // s * x
|
||||
"xvmulsp 45, 33, 37 \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %3 \n\t" // load x
|
||||
"lxvw4x 33, %15, %3 \n\t"
|
||||
"lxvd2x 32, 0, %3 \n\t" // load x
|
||||
"lxvd2x 33, %15, %3 \n\t"
|
||||
|
||||
"xvmulsp 46, 34, 37 \n\t"
|
||||
"xvmulsp 47, 35, 37 \n\t"
|
||||
|
||||
"lxvw4x 34, %16, %3 \n\t"
|
||||
"lxvw4x 35, %17, %3 \n\t"
|
||||
"lxvd2x 34, %16, %3 \n\t"
|
||||
"lxvd2x 35, %17, %3 \n\t"
|
||||
|
||||
"xvmulsp %x9, 48, 37 \n\t" // s * y
|
||||
"xvmulsp %x10, 49, 37 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %4 \n\t" // load y
|
||||
"lxvw4x 49, %15, %4 \n\t"
|
||||
"lxvd2x 48, 0, %4 \n\t" // load y
|
||||
"lxvd2x 49, %15, %4 \n\t"
|
||||
|
||||
"xvmulsp %x11, 50, 37 \n\t"
|
||||
"xvmulsp %x12, 51, 37 \n\t"
|
||||
|
||||
"lxvw4x 50, %16, %4 \n\t"
|
||||
"lxvw4x 51, %17, %4 \n\t"
|
||||
"lxvd2x 50, %16, %4 \n\t"
|
||||
"lxvd2x 51, %17, %4 \n\t"
|
||||
|
||||
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
|
||||
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
|
||||
@@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
|
||||
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t" // store x
|
||||
"stxvw4x 41, %15, %3 \n\t"
|
||||
"stxvw4x 42, %16, %3 \n\t"
|
||||
"stxvw4x 43, %17, %3 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t" // store x
|
||||
"stxvd2x 41, %15, %3 \n\t"
|
||||
"stxvd2x 42, %16, %3 \n\t"
|
||||
"stxvd2x 43, %17, %3 \n\t"
|
||||
|
||||
"stxvw4x %x5, 0, %4 \n\t" // store y
|
||||
"stxvw4x %x6, %15, %4 \n\t"
|
||||
"stxvw4x %x7, %16, %4 \n\t"
|
||||
"stxvw4x %x8, %17, %4 \n\t"
|
||||
"stxvd2x %x5, 0, %4 \n\t" // store y
|
||||
"stxvd2x %x6, %15, %4 \n\t"
|
||||
"stxvd2x %x7, %16, %4 \n\t"
|
||||
"stxvd2x %x8, %17, %4 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %4, %4, 128 \n\t"
|
||||
@@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
|
||||
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t" // store x
|
||||
"stxvw4x 41, %15, %3 \n\t"
|
||||
"stxvw4x 42, %16, %3 \n\t"
|
||||
"stxvw4x 43, %17, %3 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t" // store x
|
||||
"stxvd2x 41, %15, %3 \n\t"
|
||||
"stxvd2x 42, %16, %3 \n\t"
|
||||
"stxvd2x 43, %17, %3 \n\t"
|
||||
|
||||
"stxvw4x %x5, 0, %4 \n\t" // store y
|
||||
"stxvw4x %x6, %15, %4 \n\t"
|
||||
"stxvw4x %x7, %16, %4 \n\t"
|
||||
"stxvw4x %x8, %17, %4 \n"
|
||||
"stxvd2x %x5, 0, %4 \n\t" // store y
|
||||
"stxvd2x %x6, %15, %4 \n\t"
|
||||
"stxvd2x %x7, %16, %4 \n\t"
|
||||
"stxvd2x %x8, %17, %4 \n"
|
||||
|
||||
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
|
||||
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
|
||||
|
||||
@@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||
"xscvdpspn %x3, %x3 \n\t"
|
||||
"xxspltw %x3, %x3, 0 \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %4, %2 \n\t"
|
||||
"lxvw4x 34, %5, %2 \n\t"
|
||||
"lxvw4x 35, %6, %2 \n\t"
|
||||
"lxvw4x 36, %7, %2 \n\t"
|
||||
"lxvw4x 37, %8, %2 \n\t"
|
||||
"lxvw4x 38, %9, %2 \n\t"
|
||||
"lxvw4x 39, %10, %2 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %4, %2 \n\t"
|
||||
"lxvd2x 34, %5, %2 \n\t"
|
||||
"lxvd2x 35, %6, %2 \n\t"
|
||||
"lxvd2x 36, %7, %2 \n\t"
|
||||
"lxvd2x 37, %8, %2 \n\t"
|
||||
"lxvd2x 38, %9, %2 \n\t"
|
||||
"lxvd2x 39, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
@@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||
|
||||
"xvmulsp 40, 32, %x3 \n\t"
|
||||
"xvmulsp 41, 33, %x3 \n\t"
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %4, %2 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %4, %2 \n\t"
|
||||
"xvmulsp 42, 34, %x3 \n\t"
|
||||
"xvmulsp 43, 35, %x3 \n\t"
|
||||
"lxvw4x 34, %5, %2 \n\t"
|
||||
"lxvw4x 35, %6, %2 \n\t"
|
||||
"lxvd2x 34, %5, %2 \n\t"
|
||||
"lxvd2x 35, %6, %2 \n\t"
|
||||
"xvmulsp 44, 36, %x3 \n\t"
|
||||
"xvmulsp 45, 37, %x3 \n\t"
|
||||
"lxvw4x 36, %7, %2 \n\t"
|
||||
"lxvw4x 37, %8, %2 \n\t"
|
||||
"lxvd2x 36, %7, %2 \n\t"
|
||||
"lxvd2x 37, %8, %2 \n\t"
|
||||
"xvmulsp 46, 38, %x3 \n\t"
|
||||
"xvmulsp 47, 39, %x3 \n\t"
|
||||
"lxvw4x 38, %9, %2 \n\t"
|
||||
"lxvw4x 39, %10, %2 \n\t"
|
||||
"lxvd2x 38, %9, %2 \n\t"
|
||||
"lxvd2x 39, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, -128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %2 \n\t"
|
||||
"stxvw4x 41, %4, %2 \n\t"
|
||||
"stxvw4x 42, %5, %2 \n\t"
|
||||
"stxvw4x 43, %6, %2 \n\t"
|
||||
"stxvw4x 44, %7, %2 \n\t"
|
||||
"stxvw4x 45, %8, %2 \n\t"
|
||||
"stxvw4x 46, %9, %2 \n\t"
|
||||
"stxvw4x 47, %10, %2 \n\t"
|
||||
"stxvd2x 40, 0, %2 \n\t"
|
||||
"stxvd2x 41, %4, %2 \n\t"
|
||||
"stxvd2x 42, %5, %2 \n\t"
|
||||
"stxvd2x 43, %6, %2 \n\t"
|
||||
"stxvd2x 44, %7, %2 \n\t"
|
||||
"stxvd2x 45, %8, %2 \n\t"
|
||||
"stxvd2x 46, %9, %2 \n\t"
|
||||
"stxvd2x 47, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 256 \n\t"
|
||||
|
||||
@@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||
"xvmulsp 46, 38, %x3 \n\t"
|
||||
"xvmulsp 47, 39, %x3 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %2 \n\t"
|
||||
"stxvw4x 41, %4, %2 \n\t"
|
||||
"stxvw4x 42, %5, %2 \n\t"
|
||||
"stxvw4x 43, %6, %2 \n\t"
|
||||
"stxvw4x 44, %7, %2 \n\t"
|
||||
"stxvw4x 45, %8, %2 \n\t"
|
||||
"stxvw4x 46, %9, %2 \n\t"
|
||||
"stxvw4x 47, %10, %2 \n"
|
||||
"stxvd2x 40, 0, %2 \n\t"
|
||||
"stxvd2x 41, %4, %2 \n\t"
|
||||
"stxvd2x 42, %5, %2 \n\t"
|
||||
"stxvd2x 43, %6, %2 \n\t"
|
||||
"stxvd2x 44, %7, %2 \n\t"
|
||||
"stxvd2x 45, %8, %2 \n\t"
|
||||
"stxvd2x 46, %9, %2 \n\t"
|
||||
"stxvd2x 47, %10, %2 \n"
|
||||
|
||||
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
||||
:
|
||||
@@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvw4x %x3, 0, %2 \n\t"
|
||||
"stxvw4x %x3, %4, %2 \n\t"
|
||||
"stxvw4x %x3, %5, %2 \n\t"
|
||||
"stxvw4x %x3, %6, %2 \n\t"
|
||||
"stxvw4x %x3, %7, %2 \n\t"
|
||||
"stxvw4x %x3, %8, %2 \n\t"
|
||||
"stxvw4x %x3, %9, %2 \n\t"
|
||||
"stxvw4x %x3, %10, %2 \n\t"
|
||||
"stxvd2x %x3, 0, %2 \n\t"
|
||||
"stxvd2x %x3, %4, %2 \n\t"
|
||||
"stxvd2x %x3, %5, %2 \n\t"
|
||||
"stxvd2x %x3, %6, %2 \n\t"
|
||||
"stxvd2x %x3, %7, %2 \n\t"
|
||||
"stxvd2x %x3, %8, %2 \n\t"
|
||||
"stxvd2x %x3, %9, %2 \n\t"
|
||||
"stxvd2x %x3, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
||||
@@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %4 \n\t"
|
||||
"lxvw4x 33, %5, %4 \n\t"
|
||||
"lxvw4x 34, %6, %4 \n\t"
|
||||
"lxvw4x 35, %7, %4 \n\t"
|
||||
"lxvw4x 36, %8, %4 \n\t"
|
||||
"lxvw4x 37, %9, %4 \n\t"
|
||||
"lxvw4x 38, %10, %4 \n\t"
|
||||
"lxvw4x 39, %11, %4 \n\t"
|
||||
"lxvd2x 32, 0, %4 \n\t"
|
||||
"lxvd2x 33, %5, %4 \n\t"
|
||||
"lxvd2x 34, %6, %4 \n\t"
|
||||
"lxvd2x 35, %7, %4 \n\t"
|
||||
"lxvd2x 36, %8, %4 \n\t"
|
||||
"lxvd2x 37, %9, %4 \n\t"
|
||||
"lxvd2x 38, %10, %4 \n\t"
|
||||
"lxvd2x 39, %11, %4 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %3 \n\t"
|
||||
"lxvw4x 41, %5, %3 \n\t"
|
||||
"lxvw4x 42, %6, %3 \n\t"
|
||||
"lxvw4x 43, %7, %3 \n\t"
|
||||
"lxvw4x 44, %8, %3 \n\t"
|
||||
"lxvw4x 45, %9, %3 \n\t"
|
||||
"lxvw4x 46, %10, %3 \n\t"
|
||||
"lxvw4x 47, %11, %3 \n\t"
|
||||
"lxvd2x 40, 0, %3 \n\t"
|
||||
"lxvd2x 41, %5, %3 \n\t"
|
||||
"lxvd2x 42, %6, %3 \n\t"
|
||||
"lxvd2x 43, %7, %3 \n\t"
|
||||
"lxvd2x 44, %8, %3 \n\t"
|
||||
"lxvd2x 45, %9, %3 \n\t"
|
||||
"lxvd2x 46, %10, %3 \n\t"
|
||||
"lxvd2x 47, %11, %3 \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %4 \n\t"
|
||||
"stxvw4x 41, %5, %4 \n\t"
|
||||
"stxvw4x 42, %6, %4 \n\t"
|
||||
"stxvw4x 43, %7, %4 \n\t"
|
||||
"stxvw4x 44, %8, %4 \n\t"
|
||||
"stxvw4x 45, %9, %4 \n\t"
|
||||
"stxvw4x 46, %10, %4 \n\t"
|
||||
"stxvw4x 47, %11, %4 \n\t"
|
||||
"stxvd2x 40, 0, %4 \n\t"
|
||||
"stxvd2x 41, %5, %4 \n\t"
|
||||
"stxvd2x 42, %6, %4 \n\t"
|
||||
"stxvd2x 43, %7, %4 \n\t"
|
||||
"stxvd2x 44, %8, %4 \n\t"
|
||||
"stxvd2x 45, %9, %4 \n\t"
|
||||
"stxvd2x 46, %10, %4 \n\t"
|
||||
"stxvd2x 47, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user