diff --git a/.travis.yml b/.travis.yml index 4a25e7121..4efa23b8d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,10 @@ dist: precise sudo: true language: c -jobs: +matrix: include: - &test-ubuntu os: linux - stage: test compiler: gcc addons: apt: @@ -59,7 +58,6 @@ jobs: - BTYPE="BINARY=32" - os: linux - stage: test compiler: gcc addons: apt: @@ -80,13 +78,12 @@ jobs: # that don't require sudo. - &test-alpine os: linux - stage: test dist: trusty sudo: true language: minimal before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ - && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' @@ -124,7 +121,6 @@ jobs: - &test-cmake os: linux - stage: test compiler: clang addons: apt: @@ -153,7 +149,6 @@ jobs: - &test-macos os: osx - stage: test osx_image: xcode8 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" @@ -168,6 +163,42 @@ jobs: env: - BTYPE="BINARY=32" + - &emulated-arm + dist: trusty + sudo: required + services: docker + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + name: "Emulated Build for ARMV6 with gcc" + before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset + script: | + echo "FROM openblas/alpine:${IMAGE_ARCH} + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=${TARGET_ARCH} \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." 
> Dockerfile + docker build . + - <<: *emulated-arm + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + name: "Emulated Build for ARMV8 with gcc" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + name: "Emulated Build for ARMV8 with clang" + + allow_failures: + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + # whitelist branches: only: diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f985455b..296113941 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 3) +set(OpenBLAS_PATCH_VERSION 4) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -15,16 +15,21 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(OpenBLAS_LIBNAME openblas) - ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) +option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() -option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) -option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) -option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) -option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) +option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older cpu models 
(Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + +# Add a prefix or suffix to all exported symbol names in the shared library. +# Avoids conflicts with other BLAS libraries, especially when using +# 64 bit integer interfaces in OpenBLAS. + +set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) +set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) @@ -38,11 +43,13 @@ endif() ####### -message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") +message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") +set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) + set(BLASDIRS interface driver/level2 driver/level3 driver/others) if (NOT DYNAMIC_ARCH) @@ -210,15 +217,84 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) +if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "") +if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") +else() + set(ARCH_IN ${ARCH}) +endif() + +if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") +endif () + +if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) +else() + set(EXPRECISION_IN ${EXPRECISION}) +endif() + +if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) +else() + set(NO_CBLAS_IN ${NO_CBLAS}) +endif() + +if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) +else() + set(NO_LAPACK_IN ${NO_LAPACK}) +endif() + +if (NOT DEFINED 
NO_LAPACKE) + set(NO_LAPACKE_IN 0) +else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) +endif() + +if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) +else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) +endif() + +if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) +else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) +endif() + +if (NOT DEFINED BU) + set(BU _) +endif() + +if (NOT ${SYMBOLPREFIX} STREQUAL "") +message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") +endif() +if (NOT ${SYMBOLSUFFIX} STREQUAL "") +message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") +endif() + add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) +endif() + + # Install project # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLASTargets" + EXPORT "OpenBLAS${SUFFIX64}Targets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# Install headers +set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) +set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) + message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) @@ -266,29 +342,31 @@ if(NOT NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in 
"${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() include(FindPkgConfig QUIET) if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) + configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) + install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in - "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake VERSION ${${PN}_VERSION} COMPATIBILITY AnyNewerVersion) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) -install(EXPORT "${PN}Targets" - NAMESPACE "${PN}::" +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + RENAME ${PN}${SUFFIX64}ConfigVersion.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(EXPORT "${PN}${SUFFIX64}Targets" + NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) diff --git a/Makefile b/Makefile index d99521b19..d42f9b8c3 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ ifeq ($(NOFORTRAN), $(filter 
0,$(NOFORTRAN))) -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.arm64 b/Makefile.arm64 index d19e796a5..a529fab80 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a endif -ifeq ($(CORE), CORTEXA57) -CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 -FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +ifeq ($(CORE), CORTEXA53) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 endif -ifeq ($(CORE), VULCAN) -CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan -FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan +ifeq ($(CORE), CORTEXA57) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +endif + +ifeq ($(CORE), CORTEXA72) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif + +ifeq ($(CORE), CORTEXA73) +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif ifeq ($(CORE), THUNDERX) -CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx -FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx +CCOMMON_OPT += -march=armv8-a -mtune=thunderx +FCOMMON_OPT += -march=armv8-a -mtune=thunderx +endif + +ifeq ($(CORE), FALKOR) +CCOMMON_OPT += -march=armv8.1-a -mtune=falkor +FCOMMON_OPT += -march=armv8.1-a -mtune=falkor endif ifeq ($(CORE), THUNDERX2T99) -CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 -FCOMMON_OPT += 
-mtune=thunderx2t99 -mcpu=thunderx2t99 +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif diff --git a/Makefile.install b/Makefile.install index fa657beba..069c96c6a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -48,6 +48,7 @@ ifndef NO_CBLAS @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif +ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif + ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif +else +#install on AIX has different options syntax +ifndef NO_LAPACKE + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" +endif + +#for install static library +ifndef NO_STATIC + @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBNAME) 
$(LIBPREFIX).$(LIBSUFFIX) +endif +#for install shared library +ifndef NO_SHARED + @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ + ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +endif + +endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" diff --git a/Makefile.rule b/Makefile.rule index 6457532c8..f3086a01b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.3 +VERSION = 0.3.4 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -107,13 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1 # BUILD_RELAPACK = 1 # If you want to use legacy threaded Level 3 implementation. -USE_SIMPLE_THREADED_LEVEL3 = 1 +# USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses # thread-local storage instead of a central memory buffer in memory.c # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 # for this to work. -USE_TLS = 1 +# USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran # compiler supports this. It's safe to keep comment it out if you @@ -152,6 +152,9 @@ NO_AFFINITY = 1 # FUNCTION_PROFILE = 1 # Support for IEEE quad precision(it's *real* REAL*16)( under testing) +# This option should not be used - it is a holdover from unfinished code present +# in the original GotoBLAS2 library that may be usable as a starting point but +# is not even expected to compile in its present form. # QUAD_PRECISION = 1 # Theads are still working for a while after finishing BLAS operation @@ -189,8 +192,8 @@ NO_AFFINITY = 1 # Flags for POWER8 are defined in Makefile.power. 
Don't modify COMMON_OPT # COMMON_OPT = -O2 -# gfortran option for LAPACK -# enable this flag only on 64bit Linux and if you need a thread safe lapack library +# gfortran option for LAPACK to improve thread-safety +# It is enabled by default in Makefile.system for gfortran # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive diff --git a/Makefile.system b/Makefile.system index 2123af204..22fe24337 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,11 @@ ifndef TOPDIR TOPDIR = . endif +# Catch conflicting usage of ARCH in some BSD environments +ifeq ($(ARCH), amd64) +override ARCH=x86_64 +endif + NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # Default C compiler @@ -505,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT) #CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' endif +ifeq ($(ARCH), arm64) +DYNAMIC_CORE = ARMV8 +DYNAMIC_CORE += CORTEXA57 +DYNAMIC_CORE += THUNDERX +DYNAMIC_CORE += THUNDERX2T99 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= @@ -713,6 +725,8 @@ endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall +# make single-threaded LAPACK calls thread-safe #1847 +FCOMMON_OPT += -frecursive #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran @@ -1022,6 +1036,8 @@ ifdef USE_TLS CCOMMON_OPT += -DUSE_TLS endif +CCOMMON_OPT += -DVERSION=\"$(VERSION)\" + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif @@ -1199,7 +1215,11 @@ endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a +ifneq ($(OSNAME), AIX) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +else +LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) +endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f831b5040..f2647fb7d 100644 --- a/Makefile.x86_64 +++ 
b/Makefile.x86_64 @@ -15,6 +15,11 @@ FCOMMON_OPT += -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif endif endif diff --git a/TargetList.txt b/TargetList.txt index 31e4881c4..3d04a57cf 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -83,8 +83,11 @@ ARMV5 8.ARM 64-bit CPU: ARMV8 +CORTEXA53 CORTEXA57 -VULCAN +CORTEXA72 +CORTEXA73 +FALKOR THUNDERX THUNDERX2T99 diff --git a/c_check b/c_check index 66acf1cad..9dc237beb 100644 --- a/c_check +++ b/c_check @@ -205,7 +205,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "int main(void){ __asm__ volatile($code); }\n"; + print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; diff --git a/cblas.h b/cblas.h index 6461f4209..d340a2037 100644 --- a/cblas.h +++ b/cblas.h @@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1 typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; - +typedef CBLAS_ORDER CBLAS_LAYOUT; + float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST 
blasint incy); diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 1446a900d..adec28a91 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,6 +3,11 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. +if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) @@ -39,7 +44,7 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index ca88a6d5f..df4b2ab06 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,4 +1,5 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ @@ -6,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} diff --git a/cmake/system.cmake b/cmake/system.cmake index 18b2c3b87..d803bb9eb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -41,6 +41,12 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () +if (DEFINED TARGET) +if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") +endif() +endif() + if (DEFINED 
TARGET) message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") @@ -304,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") endif () +set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"") + set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d339a755f..6b602c1b0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () +if (${HOST_OS} STREQUAL "LINUX") +# check if we're building natively on Android (TERMUX) + EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) + if(${OPERATING_SYSTEM} MATCHES "Android") + set(HOST_OS ANDROID) + endif(${OPERATING_SYSTEM} MATCHES "Android") +endif() + + + if(CMAKE_COMPILER_IS_GNUCC AND WIN32) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE @@ -67,7 +77,7 @@ else() endif() if (X86_64 OR X86) - file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") + file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") diff --git a/common.h b/common.h index 6c3d5b15e..7fcd5e316 100644 --- a/common.h +++ b/common.h @@ -183,7 +183,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ diff 
--git a/common_mips64.h b/common_mips64.h index 93bc7e519..1163413dc 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){ #define RPCC_DEFINED #ifndef NO_AFFINITY -#define WHEREAMI +//#define WHEREAMI static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a42346c88..c914fbc2b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -29,25 +29,37 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 -#define CPU_CORTEXA57 2 -#define CPU_VULCAN 3 -#define CPU_THUNDERX 4 -#define CPU_THUNDERX2T99 5 +// Arm +#define CPU_CORTEXA53 2 +#define CPU_CORTEXA57 3 +#define CPU_CORTEXA72 4 +#define CPU_CORTEXA73 5 +// Qualcomm +#define CPU_FALKOR 6 +// Cavium +#define CPU_THUNDERX 7 +#define CPU_THUNDERX2T99 8 static char *cpuname[] = { "UNKNOWN", "ARMV8" , + "CORTEXA53", "CORTEXA57", - "VULCAN", + "CORTEXA72", + "CORTEXA73", + "FALKOR", "THUNDERX", "THUNDERX2T99" }; static char *cpuname_lower[] = { "unknown", - "armv8" , + "armv8", + "cortexa53", "cortexa57", - "vulcan", + "cortexa72", + "cortexa73", + "falkor", "thunderx", "thunderx2t99" }; @@ -114,14 +126,24 @@ int detect(void) fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { - if (strstr(cpu_implementer, "0x41") && - (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") )) - return CPU_CORTEXA57; //or compatible A53, A72 - else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) - return CPU_VULCAN; - else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) + // Arm + if (strstr(cpu_implementer, "0x41")) { + if (strstr(cpu_part, "0xd03")) + return CPU_CORTEXA53; + else if (strstr(cpu_part, "0xd07")) + return CPU_CORTEXA57; + else if (strstr(cpu_part, "0xd08")) + return CPU_CORTEXA72; + else if (strstr(cpu_part, "0xd09")) + return CPU_CORTEXA73; + } + // Qualcomm + else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) + 
return CPU_FALKOR; + // Cavium + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1")) return CPU_THUNDERX; - else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43")) + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; } @@ -180,64 +202,63 @@ void get_subdirname(void) void get_cpuconfig(void) { + // All arches should define ARMv8 + printf("#define ARMV8\n"); + printf("#define HAVE_NEON\n"); // This shouldn't be necessary + printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary + int d = detect(); switch (d) { + case CPU_CORTEXA53: + printf("#define %s\n", cpuname[d]); + // Fall-through case CPU_ARMV8: - printf("#define ARMV8\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); - break; - - case CPU_VULCAN: - printf("#define VULCAN \n"); - printf("#define HAVE_VFP \n"); - printf("#define HAVE_VFPV3 \n"); - printf("#define HAVE_NEON \n"); - printf("#define HAVE_VFPV4 \n"); - printf("#define L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); + // Minimum parameters for ARMv8 (based on A53) + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 
64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_CORTEXA57: - printf("#define CORTEXA57\n"); - printf("#define HAVE_VFP\n"); - printf("#define HAVE_VFPV3\n"); - printf("#define HAVE_NEON\n"); - printf("#define HAVE_VFPV4\n"); + case CPU_CORTEXA72: + case CPU_CORTEXA73: + // Common minimum settings for these Arm cores + // Can change a lot, but we need to be conservative + // TODO: detect info from /sys if possible + printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 49152\n"); printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 2097152\n"); + printf("#define L2_SIZE 524288\n"); printf("#define L2_LINESIZE 64\n"); printf("#define L2_ASSOCIATIVE 16\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + case CPU_THUNDERX: - printf("#define ARMV8\n"); printf("#define THUNDERX\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 128\n"); @@ -250,10 +271,6 @@ void get_cpuconfig(void) case CPU_THUNDERX2T99: printf("#define VULCAN \n"); - printf("#define HAVE_VFP \n"); - printf("#define HAVE_VFPV3 \n"); - printf("#define HAVE_NEON \n"); - printf("#define HAVE_VFPV4 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); diff 
--git a/cpuid_power.c b/cpuid_power.c index 6c7baef4a..23e98ebb0 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -56,6 +56,7 @@ #define CPUTYPE_CELL 6 #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 +#define CPUTYPE_POWER9 9 char *cpuname[] = { "UNKNOWN", @@ -66,7 +67,8 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", - "POWER8" + "POWER8", + "POWER9" }; char *lowercpuname[] = { @@ -78,7 +80,8 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", - "power8" + "power8", + "power9" }; char *corename[] = { @@ -90,7 +93,8 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", - "POWER8" + "POWER8", + "POWER8" }; int detect(void){ @@ -120,6 +124,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -127,6 +132,33 @@ int detect(void){ #endif #ifdef _AIX + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = popen("prtconf|grep 'Processor Type'", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Pro", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + pclose(infile); + + if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; + if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; + if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; + if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; + if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; + if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; #endif @@ 
-143,12 +175,12 @@ int detect(void){ return CPUTYPE_PPC970; #endif -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) int id; -id = __asm __volatile("mfpvr %0" : "=r"(id)); +__asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 - return return CPUTYPE_POWER8; + return CPUTYPE_POWER8; break; case 0x4d: case 0x4b: // POWER8/8E diff --git a/cpuid_x86.c b/cpuid_x86.c index 512ad877b..8e4a7cb84 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2009,6 +2009,8 @@ int get_coretype(void){ switch (model) { case 1: // AMD Ryzen + case 8: + // Ryzen 2 if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index 061454848..d57740314 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -62,9 +62,36 @@ #endif #endif -#ifndef TRANSA +#ifndef thread_local +# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__ +# define thread_local _Thread_local +# elif defined _WIN32 && ( \ + defined _MSC_VER || \ + defined __ICL || \ + defined __DMC__ || \ + defined __BORLANDC__ ) +# define thread_local __declspec(thread) +/* note that ICC (linux) and Clang are covered by __GNUC__ */ +# elif defined __GNUC__ || \ + defined __SUNPRO_C || \ + defined __xlC__ +# define thread_local __thread +# else +# define UNSAFE +#endif +#endif +#if defined USE_OPENMP +#undef UNSAFE +#endif + +#if !defined(TRANSA) && !defined(UNSAFE) #define Y_DUMMY_NUM 1024 +#if defined(USE_OPENMP) static FLOAT y_dummy[Y_DUMMY_NUM]; +#pragma omp threadprivate(y_dummy) +# else +static thread_local FLOAT y_dummy[Y_DUMMY_NUM]; +# endif #endif static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ @@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifdef TRANSA y += n_from * incy * COMPSIZE; 
#else +# ifndef UNSAFE //for split matrix row (n) direction and vector x of gemv_n x += n_from * incx * COMPSIZE; //store partial result for every thread y += (m_to - m_from) * 1 * COMPSIZE * pos; +# endif #endif } @@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x BLASLONG width, i, num_cpu; -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) int split_x=0; #endif @@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x i -= width; } -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) //try to split matrix on row direction and x. //Then, reduction. if (num_cpu < nthreads) { @@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x exec_blas(num_cpu, queue); } -#ifndef TRANSA +#if !defined(TRANSA) && !defined(UNSAFE) if(split_x==1){ //reduction for(i=0; i BLAS3_MEM_ALLOC_THRESHOLD @@ -510,10 +514,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, return 0; } +static int round_up(int remainder, int width, int multiple) +{ + if (multiple > remainder || width <= multiple) + return width; + width = (width + multiple - 1) / multiple; + width = width * multiple; + return width; +} + + static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { +#ifndef USE_OPENMP +#ifndef OS_WINDOWS +static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#else +CRITICAL_SECTION level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + blas_arg_t newarg; #ifndef USE_ALLOC_HEAP @@ -554,6 +577,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif #endif +#ifndef USE_OPENMP +#ifndef OS_WINDOWS +pthread_mutex_lock(&level3_lock); +#else +EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + #ifdef USE_ALLOC_HEAP /* Dynamically allocate 
workspace */ job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); @@ -601,9 +632,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); + + width = round_up(m, width, GEMM_PREFERED_SIZE); + m -= width; + if (m < 0) width = width + m; range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } for (i = num_parts; i < MAX_CPU_NUMBER; i++) { @@ -645,9 +681,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < SWITCH_RATIO) { width = SWITCH_RATIO; } + width = round_up(n, width, GEMM_PREFERED_SIZE); + n -= width; if (n < 0) width = width + n; range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { @@ -671,6 +710,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG free(job); #endif +#ifndef USE_OPENMP +#ifndef OS_WINDOWS + pthread_mutex_unlock(&level3_lock); +#else + LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + return 0; } diff --git a/driver/others/Makefile b/driver/others/Makefile index e61ba7bc8..3dc2e7c1b 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -15,7 +15,11 @@ endif # COMMONOBJS += info.$(SUFFIX) ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +COMMONOBJS += dynamic_arm64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c endif ifeq ($(DYNAMIC_ARCH), 1) +ifeq ($(ARCH),arm64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 1d7f570d8..e5db1804f 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c 
@@ -582,7 +582,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " @@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) { long i; +#ifdef SMP_SERVER + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif + if (num_threads < 1) num_threads = blas_num_threads; #ifndef NO_AFFINITY diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 02a25ac39..bae344c59 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ void goto_set_num_threads(int num_threads) { - long i; + long i; + +#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); +#endif if (num_threads < 1) num_threads = blas_cpu_number; diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c new file mode 100644 index 000000000..b4ce6b67d --- /dev/null +++ b/driver/others/dynamic_arm64.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include +#include + +extern gotoblas_t gotoblas_ARMV8; +extern gotoblas_t gotoblas_CORTEXA57; +extern gotoblas_t gotoblas_THUNDERX; +extern gotoblas_t gotoblas_THUNDERX2T99; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 4 + +/* + * In case asm/hwcap.h is outdated on the build system, make sure + * that HWCAP_CPUID is defined + */ +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif + +#define get_cpu_ftr(id, var) ({ \ + asm("mrs %0, "#id : "=r" (var)); \ + }) + +static char *corename[] = { + "armv8", + "cortexa57", + "thunderx", + "thunderx2t99", + "unknown" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_ARMV8) return corename[ 0]; + if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1]; + if (gotoblas == &gotoblas_THUNDERX) return corename[ 2]; + if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i ; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_ARMV8); + case 1: return (&gotoblas_CORTEXA57); + case 2: return (&gotoblas_THUNDERX); + case 3: return (&gotoblas_THUNDERX2T99); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int implementer, variant, part, arch, revision, midr_el1; + + if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { + char coremsg[128]; + snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); + openblas_warning(1, coremsg); + return NULL; + } + + get_cpu_ftr(MIDR_EL1, midr_el1); + /* + * MIDR_EL1 + * + * 31 24 23 20 19 16 15 4 3 0 + * ----------------------------------------------------------------- + * | Implementer | Variant | Architecture | Part Number | Revision | + * ----------------------------------------------------------------- + */ + implementer = (midr_el1 >> 24) & 0xFF; + part = (midr_el1 >> 4) & 0xFFF; + + switch(implementer) + { + case 0x41: // ARM + switch (part) + { + case 0xd07: // Cortex A57 + case 0xd08: // Cortex A72 + case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA57; + } + break; + case 0x42: // Broadcom + switch (part) + { + case 0x516: // Vulcan + return &gotoblas_THUNDERX2T99; + } + break; + case 0x43: // Cavium + switch (part) + { + case 0x0a1: // ThunderX + return &gotoblas_THUNDERX; + case 0x0af: // ThunderX2 + return &gotoblas_THUNDERX2T99; + } + break; + } + return NULL; +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_ARMV8; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/memory.c b/driver/others/memory.c index 9d4ab19f5..36815a39c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,8 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(USE_TLS) +#if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS + +#if USE_TLS != 1 +#undef COMPILE_TLS +#endif + #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS @@ -254,6 +259,16 @@ int get_num_procs(void) { } #endif +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + + + #ifdef OS_WINDOWS int get_num_procs(void) { @@ -1733,6 +1748,22 @@ int get_num_procs(void) { return nums; } #endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif #ifdef OS_WINDOWS @@ -2555,7 +2586,7 @@ void *blas_memory_alloc(int procpos){ printf("Alloc Start ...\n"); #endif -#if defined(WHEREAMI) && !defined(USE_OPENMP) +/* #if defined(WHEREAMI) && !defined(USE_OPENMP) mypos = WhereAmI(); @@ -2565,12 +2596,12 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { LOCK_COMMAND(&alloc_lock); -/* blas_lock(&memory[position].lock);*/ +// blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ +// blas_unlock(&memory[position].lock); } position ++; @@ -2578,24 +2609,24 @@ void *blas_memory_alloc(int procpos){ } while (position < NUM_BUFFERS); -#endif +#endif */ position = 0; + LOCK_COMMAND(&alloc_lock); do 
{ /* if (!memory[position].used) { */ - LOCK_COMMAND(&alloc_lock); /* blas_lock(&memory[position].lock);*/ if (!memory[position].used) goto allocation; - UNLOCK_COMMAND(&alloc_lock); /* blas_unlock(&memory[position].lock);*/ /* } */ position ++; } while (position < NUM_BUFFERS); + UNLOCK_COMMAND(&alloc_lock); goto error; diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 3e87f2cc2..eca494dca 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -42,8 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif static char* openblas_config_str="" +"OpenBLAS " + VERSION +" " #ifdef USE64BITINT - "USE64BITINT " + " USE64BITINT " #endif #ifdef NO_CBLAS "NO_CBLAS " diff --git a/driver/others/parameter.c b/driver/others/parameter.c index e7332c0c4..8bf7da78b 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,35 +730,8 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) -unsigned long dgemm_prefetch_size_a; -unsigned long dgemm_prefetch_size_b; -unsigned long dgemm_prefetch_size_c; -#endif - void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) - dgemm_p = 160; - dgemm_q = 128; - dgemm_r = 4096; - - sgemm_p = 128; - sgemm_q = 352; - sgemm_r = 4096; - - cgemm_p = 128; - cgemm_q = 224; - cgemm_r = 4096; - - zgemm_p = 128; - zgemm_q = 112; - zgemm_r = 4096; - - dgemm_prefetch_size_a = 3584; - dgemm_prefetch_size_b = 512; - dgemm_prefetch_size_c = 128; -#endif } #endif diff --git a/exports/Makefile b/exports/Makefile index 29075a9c2..3a5f77db3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -114,9 +114,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< 
-Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c diff --git a/f_check b/f_check index 997e02393..34caa00be 100644 --- a/f_check +++ b/f_check @@ -292,9 +292,6 @@ if ($link ne "") { && ($flags !~ /^-LIST:/) && ($flags !~ /^-LANG:/) ) { - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= $flags . " "; } @@ -311,17 +308,11 @@ if ($link ne "") { if ($flags =~ /^\-rpath\@/) { $flags =~ s/\@/\,/g; - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= "-Wl,". $flags . " " ; } if ($flags =~ /^\-rpath-link\@/) { $flags =~ s/\@/\,/g; - if ($vendor eq "PGI") { - $flags =~ s/lib$/libso/; - } $linker_L .= "-Wl,". $flags . " " ; } @@ -330,7 +321,6 @@ if ($link ne "") { && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) - && ($flags !~ /numa/) && ($flags !~ /crt[0-9]/) && ($flags !~ /gcc/) && ($flags !~ /user32/) diff --git a/getarch.c b/getarch.c index 31f41d62c..146f1f36f 100644 --- a/getarch.c +++ b/getarch.c @@ -927,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DARMV8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "armv8" #define CORENAME "ARMV8" #endif +#ifdef FORCE_CORTEXA53 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA53" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA53 " \ + "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa53" +#define CORENAME "CORTEXA53" +#else +#endif + #ifdef FORCE_CORTEXA57 #define FORCE #define ARCHITECTURE "ARM64" @@ -942,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "cortexa57" #define CORENAME "CORTEXA57" #else #endif -#ifdef FORCE_VULCAN +#ifdef FORCE_CORTEXA72 #define FORCE #define ARCHITECTURE "ARM64" -#define SUBARCHITECTURE "VULCAN" +#define SUBARCHITECTURE "CORTEXA72" #define SUBDIRNAME "arm64" -#define ARCHCONFIG "-DVULCAN " \ - "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ - "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ - "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ +#define ARCHCONFIG "-DCORTEXA72 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" -#define LIBNAME "vulcan" -#define CORENAME "VULCAN" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa72" +#define CORENAME "CORTEXA72" +#else +#endif + +#ifdef FORCE_CORTEXA73 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA73" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA73 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa73" +#define CORENAME "CORTEXA73" +#else +#endif + +#ifdef FORCE_FALKOR +#define FORCE +#define 
ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "FALKOR" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DFALKOR " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "falkor" +#define CORENAME "FALKOR" #else #endif @@ -973,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DTHUNDERX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx" #define CORENAME "THUNDERX" #else #endif #ifdef FORCE_THUNDERX2T99 +#define ARMV8 #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "THUNDERX2T99" @@ -990,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" #define LIBNAME "thunderx2t99" #define CORENAME "THUNDERX2T99" #else diff --git a/interface/axpy.c b/interface/axpy.c index 39edea6af..9032946d2 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -75,6 +75,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (alpha == ZERO) return; + if (incx == 0 && incy == 0) { + *y += n * alpha *(*x); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index ebeb103e7..0dde33ae3 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, - laswp[flag], nthreads); + (int(*)())laswp[flag], nthreads); } #endif diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index 31e08451d..b77a40985 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); } #endif diff --git a/interface/swap.c b/interface/swap.c index f7642edf1..17a9868a9 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) // Multithreaded swap gives performance benefits in 
ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 1a0259c96..dbd559628 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -82,6 +82,12 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 0 && incy == 0) { + *y += n * (alpha_r * (*x) - alpha_i* (*(x+1)) ); + *(y+1) += n * (alpha_i * (*x) + alpha_r * (*(x +1)) ); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/zhemv.c b/interface/zhemv.c index d1996ad69..9c31f31d9 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -43,6 +43,10 @@ #include "functable.h" #endif +// this is smallest dimension N of square input a to permit threading +// see graph in issue #1820 for explanation +#define MULTI_THREAD_MINIMAL 362 + #ifdef XDOUBLE #define ERROR_NAME "XHEMV " #elif defined(DOUBLE) @@ -195,7 +199,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); + if (n $(@F) diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index 5b08e5028..9a75885a2 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vadd.f64 d1 , d1, d7 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 vadd.f32 s1 , s1, s7 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index c35b8aece..39c9ac233 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } fmacd d9 , d0, d5 - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d9 } fmacd d10, d0, d6 - fstmiad Y!, { d10 } + vstmia.f64 Y!, { d10 } fmacd d11, d0, d7 - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d11 } .endm @@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y , { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } .endm .macro KERNEL_S1 - fldmiad X , { d4 } - fldmiad Y , { d8 } + vldmia.f64 X , { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y , { d8 } + vstmia.f64 Y , { d8 } add X, X, INC_X add Y, Y, INC_Y @@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } fmacs s9 , s0, s5 - fstmias Y!, { s9 } + vstmia.f32 Y!, { s9 } fmacs s10, s0, s6 - fstmias Y!, { s10 } + vstmia.f32 Y!, { s10 } fmacs s11, s0, s7 - fstmias Y!, { s11 } + vstmia.f32 Y!, { s11 } .endm @@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y , { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } .endm .macro KERNEL_S1 - fldmias X , { s4 } - fldmias Y , { s8 } + vldmia.f32 X , { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y , { s8 } + vstmia.f32 Y , { s8 } add X, X, INC_X add Y, Y, INC_Y @@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } @@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } @@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X , { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X , { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y , { d8 - d9 } + vstmia.f64 Y , { d8 - d9 } add X, X, INC_X add Y, Y, INC_Y @@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s7 } + vldmia.f32 X!, { s4 - s7 } pld [ Y, #X_PRE ] - fldmias Y , { s8 - s11 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } @@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } @@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X , { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X , { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y , { s8 - s9 } + vstmia.f32 Y , { s8 - s9 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S index 874fcab9c..fbb32b43c 100644 --- a/kernel/arm/ccopy_vfp.S +++ b/kernel/arm/ccopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY_F4 pld [ X, #X_PRE ] - fldmias X!, { s0 - s7 } - fstmias Y!, { s0 - s7 } + vldmia.f32 X!, { s0 - s7 } + vstmia.f32 Y!, { s0 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 - s1 } - fstmias Y!, { s0 - s1 } + vldmia.f32 X!, { s0 - s1 } + vstmia.f32 Y!, { s0 - s1 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index fd86a37b0..85246d734 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 fmacs s3 , s7, s10 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index 71bc50efd..d2591919e 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } - fldmias CO2, { s4 - s7 } + vldmia.f32 CO2, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } - fldmias CO2, { s4 - s5 } + vldmia.f32 CO2, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 9d473ad78..5ebc904ac 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } - fldmias CO2, { s8 - s11 } + vldmia.f32 CO1, { s4 - s7 } + vldmia.f32 CO2, { s8 - s11 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } - fldmias CO2, { s8 - s9 } + vldmia.f32 CO1, { s4 - s5 } + vldmia.f32 CO2, { s8 - s9 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S index 29eeab492..fe4959988 100644 --- a/kernel/arm/cgemm_ncopy_2_vfp.S +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s6 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/cgemm_tcopy_2_vfp.S b/kernel/arm/cgemm_tcopy_2_vfp.S index 9036b994d..7b3ae18d4 100644 --- a/kernel/arm/cgemm_tcopy_2_vfp.S +++ b/kernel/arm/cgemm_tcopy_2_vfp.S @@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ .macro COPY2x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 -s1 } + vldmia.f32 AO1, { s0 -s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 62ee33bb9..d6b18c796 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, #8 @@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index c07b6d6f8..6833df7d1 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -150,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F2 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -180,7 +180,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -204,8 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -216,14 +216,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -249,9 +249,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -269,25 +269,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -313,8 +313,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -327,14 +327,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index aae890ea9..ca1a512fb 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -513,7 +513,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -523,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -693,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -818,7 +818,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 79e7ed07f..d75fb7735 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -170,30 +170,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -206,17 +206,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -238,19 +238,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -354,8 +354,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -532,8 +532,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -710,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -835,7 +835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S index da239924a..7ee52af88 100644 --- a/kernel/arm/dcopy_vfp.S +++ b/kernel/arm/dcopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmiad X!, { d0 - d3 } - fstmiad Y!, { d0 - d3 } + vldmia.f64 X!, { d0 - d3 } + vstmia.f64 Y!, { d0 - d3 } .endm .macro COPY_F1 - fldmiad X!, { d0 } - fstmiad Y!, { d0 } + vldmia.f64 X!, { d0 } + vstmia.f64 Y!, { d0 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index cc2e485b7..4dff5a3e1 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -67,26 +67,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d8 } + vldmia.f64 X!, { d8 } pld [ Y, #X_PRE ] - fldmiad Y!, { d4 } - fldmiad Y!, { d5 } + vldmia.f64 Y!, { d4 } + vldmia.f64 Y!, { d5 } fmacd d0 , d4, d8 - fldmiad X!, { d9 } - fldmiad Y!, { d6 } + vldmia.f64 X!, { d9 } + vldmia.f64 Y!, { d6 } fmacd d1 , d5, d9 - fldmiad X!, { d10 } - fldmiad X!, { d11 } + vldmia.f64 X!, { d10 } + vldmia.f64 X!, { d11 } fmacd d0 , d6, d10 - fldmiad Y!, { d7 } + vldmia.f64 Y!, { d7 } fmacd d1 , d7, d11 .endm .macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y!, { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y!, { d8 } fmacd d0 , d4, d8 .endm @@ -97,26 +97,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 nop - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d4, d8 - fldmiad X, { d5 } - fldmiad Y, { d9 } + vldmia.f64 X, { d5 } + vldmia.f64 Y, { d9 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d5, d9 - fldmiad X, { d6 } - fldmiad Y, { d10 } + vldmia.f64 X, { d6 } + vldmia.f64 Y, { d10 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d6, d10 - fldmiad X, { d7 } - fldmiad Y, { d11 } + vldmia.f64 X, { d7 } + vldmia.f64 Y, { d11 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d7, d11 @@ -126,8 +126,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X fmacd d0 , d4, d8 add Y, Y, INC_Y diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 1744b54d8..d852c2dad 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add r4 , CO2, r3 pld [ CO2 , #C_PRE ] - fldmiad CO1, { d8 - d11 } + vldmia.f64 CO1, { d8 - d11 } pld [ r4 , #C_PRE ] fmacd d8 , d0 , d16 @@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d15, d0 , d23 fstd d11, [CO1, #24 ] - fldmiad r4, { d8 - d11 } + vldmia.f64 r4, { d8 - d11 } fmacd d8 , d0 , d24 fstd d12, [CO2] @@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO2 , #C_PRE ] - fldmiad CO2, { d12 - d15 } + vldmia.f64 CO2, { d12 - d15 } fstd d8 , [r4 ] fmacd d12, d0 , d28 @@ -378,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmacd d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S index 6266c61d2..9642b6478 100644 --- a/kernel/arm/dgemm_ncopy_2_vfp.S +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S index ffc19a9cc..5760cbd8a 100644 --- a/kernel/arm/dgemm_ncopy_4_vfp.S +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -105,10 +105,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d11, [ AO4, #16 ] fldd d15, [ AO4, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #32 - fstmiad BO!, { d4 - d7 } - fstmiad BO!, { d8 - d15 } + vstmia.f64 BO!, { d4 - d7 } + vstmia.f64 BO!, { d8 - d15 } .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO4, #0 ] add AO3, AO3, #8 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #8 .endm @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d5 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -164,7 +164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -174,7 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S index 937f43957..8335de27c 100644 --- a/kernel/arm/dgemm_tcopy_4_vfp.S +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY4x4 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d8 - d11 } + vldmia.f64 r3, { d8 - d11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d12 - d15 } + vldmia.f64 r3, { d12 - d15 } - fstmiad BO1, { d0 - d15 } + vstmia.f64 BO1, { d0 - d15 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } add r3, r3, LDA - fldmiad r3, { d4 - d5 } + vldmia.f64 r3, { d4 - d5 } add r3, r3, LDA - fldmiad r3, { d6 - d7 } + vldmia.f64 r3, { d6 - d7 } - fstmiad BO2, { d0 - d7 } + vstmia.f64 BO2, { d0 - d7 } add AO1, AO1, #16 add BO2, BO2, #64 @@ -117,18 +117,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x4 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } add r3, r3, LDA - fldmiad r3, { d2 } + vldmia.f64 r3, { d2 } add r3, r3, LDA - fldmiad r3, { d3 } + vldmia.f64 r3, { d3 } - fstmiad BO3, { d0 - d3 } + vstmia.f64 BO3, { d0 - d3 } add AO1, AO1, #8 add BO3, BO3, #32 @@ -139,13 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -153,12 +153,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY2x2 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -166,12 +166,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } - fstmiad BO3, { d0 - d1 } + vstmia.f64 BO3, { d0 - d1 } add AO1, AO1, #8 add BO3, BO3, #16 @@ -182,9 +182,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x1 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -192,9 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 @@ -202,9 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } - fstmiad BO3, { d0 } + vstmia.f64 BO3, { d0 } add AO1, AO1, #8 add BO3, BO3, #8 diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c0c6a1677..e73936cdd 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -128,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmuld d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmuld d17 , d1, d8 fldd d9 , [ BO, #8 ] fmuld d18 , d2, d8 @@ -148,10 +148,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmuld d23 , d3, d9 fmuld d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmuld d25 , d1, d10 fmuld d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -173,10 +173,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmacd d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmacd d17 , d1, d8 fldd d9 , [ BO, #8 ] fmacd d18 , d2, d8 @@ -193,10 +193,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d23 , d3, d9 fmacd d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d25 , d1, d10 fmacd d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -225,11 +225,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] fmacd d21 , d5, d13 fmacd d22 , d6, d13 - fldmiad AO!, { d0 - d1 } + vldmia.f64 AO!, { d0 - d1 } fmacd d23 , d7, d13 fmacd d24 , d4, d14 - fldmiad AO!, { d2 - d3 } + vldmia.f64 AO!, { d2 - d3 } fmacd d25 , d5, d14 fldd d9 , [ BO, #8 ] fmacd d26 , d6, d14 @@ -257,10 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d19 , d3, d8 fmacd d20 , d0, d9 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d21 , d1, d9 fmacd d22 , d2, d9 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d23 , d3, d9 fmacd d24 , d0, d10 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fstd d11, [r4 , #24 ] fmuld d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 7c154d741..753ac27c6 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmiad XO! , { d2 } - fldmiad AO1 , { d4 - d7 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d4 - d7 } vmla.f64 d8 , d2 , d4 pld [ AO2 , #4*SIZE ] @@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d11 , d2 , d7 - fldmiad r3 , { d4 - d7 } + vldmia.f64 r3 , { d4 - d7 } vmla.f64 d12 , d2 , d4 vmla.f64 d13 , d2 , d5 @@ -164,23 +164,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d8 vmla.f64 d5 , d0, d9 vmla.f64 d6 , d0, d10 vmla.f64 d7 , d0, d11 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d12 vmla.f64 d5 , d0, d13 vmla.f64 d6 , d0, d14 vmla.f64 d7 , d0, d15 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -195,8 +195,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -204,9 +204,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4X1 pld [ AO2 , #A_PRE ] - fldmiad XO , { d2 } - fldmiad AO1 , { d8 - d11 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 - d11 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -249,24 +249,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S4 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d12 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d13 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d14 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d15 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA add XO, XO , INC_X @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO , { d4 } + vstmia.f64 YO , { d4 } add YO, YO, INC_Y .endm @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2, #A_PRE ] - fldmias XO! , { s2 } - fldmias AO1 , { s4 - s7 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s4 - s7 } vmla.f32 s8 , s2 , s4 vmla.f32 s9 , s2 , s5 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r3, AO1, #4*SIZE - fldmias r3 , { s4 - s7 } + vldmia.f32 r3 , { s4 - s7 } vmla.f32 s12 , s2 , s4 vmla.f32 s13 , s2 , s5 @@ -362,24 +362,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F8 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s8 vmla.f32 s5 , s0, s9 vmla.f32 s6 , s0, s10 vmla.f32 s7 , s0, s11 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s12 vmla.f32 s5 , s0, s13 vmla.f32 s6 , s0, s14 vmla.f32 s7 , s0, s15 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -394,8 +394,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA @@ -403,9 +403,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 - s11 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 - s11 } vmla.f32 s12 , s2 , s8 vmla.f32 s13 , s2 , s9 @@ -449,24 +449,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S4 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s12 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s13 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s14 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s15 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -482,8 +482,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA add XO, XO , INC_X @@ -492,9 +492,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO , { s4 } + vstmia.f32 YO , { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 54f958b7b..e80dc1458 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -138,8 +138,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 pld [ AO2 , #A_PRE ] @@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d16 - d23 } + vldmia.f64 YO, { d16 - d23 } vmla.f64 d16, d0, d24 vmla.f64 d17, d0, d25 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d22, d0, d30 vmla.f64 d23, d0, d31 - fstmiad YO!, { d16 - d23 } + vstmia.f64 YO!, { d16 - d23 } .endm @@ -184,8 +184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA @@ -193,9 +193,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO!, { d16 } + vstmia.f64 YO!, { d16 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ AO2 , #A_PRE ] pld [ AO2 , #A_PRE+32 ] - fldmiad XO , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 vmla.f64 d25 , d4 , d9 @@ -253,44 +253,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S8 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y - fldmiad YO, { d17 } + vldmia.f64 YO, { d17 } vmla.f64 d17, d0, d25 - fstmiad YO, { d17 } + vstmia.f64 YO, { d17 } add YO, YO, INC_Y - fldmiad YO, { d18 } + vldmia.f64 YO, { d18 } vmla.f64 d18, d0, d26 - fstmiad YO, { d18 } + vstmia.f64 YO, { d18 } add YO, YO, INC_Y - fldmiad YO, { d19 } + vldmia.f64 YO, { d19 } vmla.f64 d19, d0, d27 - fstmiad YO, { d19 } + vstmia.f64 YO, { d19 } add YO, YO, INC_Y - fldmiad YO, { d20 } + vldmia.f64 YO, { d20 } vmla.f64 d20, d0, d28 - fstmiad YO, { d20 } + vstmia.f64 YO, { d20 } add YO, YO, INC_Y - fldmiad YO, { d21 } + vldmia.f64 YO, { d21 } vmla.f64 d21, d0, d29 - fstmiad YO, { d21 } + vstmia.f64 YO, { d21 } add YO, YO, INC_Y - fldmiad YO, { d22 } + vldmia.f64 YO, { d22 } vmla.f64 d22, d0, d30 - fstmiad YO, { d22 } + vstmia.f64 YO, { d22 } add YO, YO, INC_Y - fldmiad YO, { d23 } + vldmia.f64 YO, { d23 } vmla.f64 d23, d0, d31 - fstmiad YO, { d23 } + vstmia.f64 YO, { d23 } add YO, YO, INC_Y .endm @@ -306,8 +306,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA add XO, XO, INC_X @@ -316,9 +316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y .endm @@ -361,8 +361,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmias XO! , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -379,7 +379,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s16 - s23 } + vldmia.f32 YO, { s16 - s23 } vmla.f32 s16, s0, s24 vmla.f32 s17, s0, s25 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f32 s22, s0, s30 vmla.f32 s23, s0, s31 - fstmias YO!, { s16 - s23 } + vstmia.f32 YO!, { s16 - s23 } .endm @@ -405,8 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA @@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO!, { s16 } + vstmia.f32 YO!, { s16 } .endm @@ -454,8 +454,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S8X1 pld [ AO2 , #A_PRE ] - fldmias XO , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -473,44 +473,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S8 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y - fldmias YO, { s17 } + vldmia.f32 YO, { s17 } vmla.f32 s17, s0, s25 - fstmias YO, { s17 } + vstmia.f32 YO, { s17 } add YO, YO, INC_Y - fldmias YO, { s18 } + vldmia.f32 YO, { s18 } vmla.f32 s18, s0, s26 - fstmias YO, { s18 } + vstmia.f32 YO, { s18 } add YO, YO, INC_Y - fldmias YO, { s19 } + vldmia.f32 YO, { s19 } vmla.f32 s19, s0, s27 - fstmias YO, { s19 } + vstmia.f32 YO, { s19 } add YO, YO, INC_Y - fldmias YO, { s20 } + vldmia.f32 YO, { s20 } vmla.f32 s20, s0, s28 - fstmias YO, { s20 } + vstmia.f32 YO, { s20 } add YO, YO, INC_Y - fldmias YO, { s21 } + vldmia.f32 YO, { s21 } vmla.f32 s21, s0, s29 - fstmias YO, { s21 } + vstmia.f32 YO, { s21 } add YO, YO, INC_Y - fldmias YO, { s22 } + vldmia.f32 YO, { s22 } vmla.f32 s22, s0, s30 - fstmias YO, { s22 } + vstmia.f32 YO, { s22 } add YO, YO, INC_Y - fldmias YO, { s23 } + vldmia.f32 YO, { s23 } vmla.f32 s23, s0, s31 - fstmias YO, { s23 } + vstmia.f32 YO, { s23 } add YO, YO, INC_Y .endm @@ -526,8 +526,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA add XO, XO, INC_X @@ -536,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 9559d1829..fbe51cc8c 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -112,13 +112,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! 
, { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO2!, { d4 - d5 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } vmla.f64 d2 , d12 , d8 vmla.f64 d3 , d12 , d4 @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 vmla.f64 d3 , d1 , d4 @@ -143,10 +143,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } vmla.f64 d4, d0, d2 vmla.f64 d5, d0, d3 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -160,10 +160,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d2 , d12 , d8 vmla.f64 d2 , d13 , d9 vmla.f64 d2 , d14, d10 @@ -173,17 +173,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -197,23 +197,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } + vldmia.f64 AO2!, { d4 - d5 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -229,9 +229,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X vmla.f64 d3 , d1 , d4 @@ -240,14 +240,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5, d0, d3 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -261,20 +261,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -286,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X @@ -295,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y .endm @@ -315,11 +315,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } vmla.f32 s2 , s12 , s8 vmla.f32 s3 , s12 , s4 @@ -334,9 +334,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 vmla.f32 s3 , s1 , s4 @@ -344,10 +344,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } vmla.f32 s4, s0, s2 vmla.f32 s5, s0, s3 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -359,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO1!, { s10 - s11 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s2 , s12 , s8 vmla.f32 s2 , s13 , s9 vmla.f32 s2 , s14, s10 @@ -371,17 +371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO! 
, { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 .endm .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -395,21 +395,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -425,9 +425,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X vmla.f32 s3 , s1 , s4 @@ -436,14 +436,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5, s0, s3 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -456,20 +456,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -481,8 +481,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X @@ -490,9 +490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index b1d3dadf1..a88d70016 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -108,17 +108,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -129,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d16 } + vldmia.f64 XO! 
, { d2 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -139,10 +139,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d24 - d25 } + vldmia.f64 YO, { d24 - d25 } vmla.f64 d24, d0, d4 vmla.f64 d25, d0, d5 - fstmiad YO!, { d24 - d25 } + vstmia.f64 YO!, { d24 - d25 } .endm @@ -156,23 +156,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -183,10 +183,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X - fldmiad AO2!, { d16 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -194,14 +194,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S2 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d5 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d4 , d30, d10 vmla.f64 d4 , d31, d11 @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1!, { d8 } vmla.f64 d4 , d2 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO!, { d24 } + vstmia.f64 YO!, { d24 } .endm @@ -252,18 +252,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d4 , d31, d11 @@ -272,8 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X vmla.f64 d4 , d2 , d8 @@ -281,9 +281,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -300,15 +300,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s28 - s31 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s16 - s17 } + vldmia.f32 XO! , { s28 - s31 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s16 - s17 } vmla.f32 s4 , s28 , s8 vmla.f32 s5 , s28 , s16 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 vmla.f32 s5 , s29 , s17 - fldmias AO2!, { s18 - s19 } + vldmia.f32 AO2!, { s18 - s19 } vmla.f32 s4 , s30, s10 vmla.f32 s5 , s30, s18 vmla.f32 s4 , s31, s11 @@ -319,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 } - fldmias AO1!, { s8 } - fldmias AO2!, { s16 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s16 } vmla.f32 s4 , s2 , s8 vmla.f32 s5 , s2 , s16 @@ -329,10 +329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s24 - s25 } + vldmia.f32 YO, { s24 - s25 } vmla.f32 s24, s0, s4 vmla.f32 s25, s0, s5 - fstmias YO!, { s24 - s25 } + vstmia.f32 YO!, { s24 - s25 } .endm @@ -345,22 +345,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X4 - fldmias XO , { s28 } + vldmia.f32 XO , { s28 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s16 - s17 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s16 - s17 } vmla.f32 s4 , s28 , s8 - fldmias XO , { s29 } + vldmia.f32 XO , { s29 } add XO, XO, INC_X vmla.f32 s5 , s28 , s16 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 - fldmias XO , { s30 } + vldmia.f32 XO , { s30 } add XO, XO, INC_X vmla.f32 s5 , s29 , s17 - fldmias AO2!, { s18 - s19 } + vldmia.f32 AO2!, { s18 - s19 } vmla.f32 s4 , s30, s10 - fldmias XO , { s31 } + vldmia.f32 XO , { s31 } add XO, XO, INC_X vmla.f32 s5 , s30, s18 vmla.f32 s4 , s31, s11 @@ -371,10 +371,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1!, { s8 } add XO, XO, INC_X - fldmias AO2!, { s16 } + vldmia.f32 AO2!, { s16 } vmla.f32 s4 , s2 , s8 vmla.f32 s5 , s2 , s16 @@ -382,14 +382,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s5 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y .endm @@ -402,10 +402,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s28 - s31 } - fldmias AO1!, { s8 - s9 } + vldmia.f32 XO! , { s28 - s31 } + vldmia.f32 AO1!, { s8 - s9 } vmla.f32 s4 , s28 , s8 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 vmla.f32 s4 , s30, s10 vmla.f32 s4 , s31, s11 @@ -415,17 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO! 
, { s2 } + vldmia.f32 AO1!, { s8 } vmla.f32 s4 , s2 , s8 .endm .macro SAVE_F1 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO!, { s24 } + vstmia.f32 YO!, { s24 } .endm @@ -437,18 +437,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmias XO , { s28 } + vldmia.f32 XO , { s28 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } vmla.f32 s4 , s28 , s8 - fldmias XO , { s29 } + vldmia.f32 XO , { s29 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 - fldmias XO , { s30 } + vldmia.f32 XO , { s30 } add XO, XO, INC_X vmla.f32 s4 , s30, s10 - fldmias XO , { s31 } + vldmia.f32 XO , { s31 } add XO, XO, INC_X vmla.f32 s4 , s31, s11 @@ -457,8 +457,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1!, { s8 } add XO, XO, INC_X vmla.f32 s4 , s2 , s8 @@ -466,9 +466,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index fab05c9c8..fd43b15b1 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmiad X!, { d0 } + vldmia.f64 X!, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z @@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT_S - fldmiad X, { d0 } + vldmia.f64 X, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z @@ -146,7 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 @@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmias X!, { s0 } + vldmia.f32 X!, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 @@ -182,7 +182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 } + vldmia.f32 X, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z @@ -193,7 +193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 @@ -215,7 +215,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmiad X!, { d0 -d1 } + vldmia.f64 X!, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -241,7 +241,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmiad X, { d0 -d1 } + vldmia.f64 X, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -272,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmias X!, { s0 -s1 } + vldmia.f32 X!, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -284,7 +284,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 -s1 } + vldmia.f32 X, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -312,7 +312,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index 16ac5a632..8e0937851 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 84977901d..7be1e977e 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index ea296dbc5..6aec06205 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -77,68 +77,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X, { d2 } - fstmiad Y, { d3 } + vstmia.f64 X, { d2 } + vstmia.f64 Y, { d3 } add X, X, INC_X add Y, Y, INC_Y @@ -149,68 +149,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X, { s2 } - fstmias Y, { s3 } + vstmia.f32 X, { s2 } + vstmia.f32 Y, { s3 } add X, X, INC_X add Y, Y, INC_Y @@ -230,96 +230,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 @@ -347,96 +347,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + 
vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index cc3e3b98d..8992c35a8 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -64,30 +64,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X, { d4 - d7 } + vldmia.f64 X, { d4 - d7 } vmul.f64 d4, d4, d0 vmul.f64 d5, d5, d0 vmul.f64 d6, d6, d0 - fstmiad X!, { d4 - d5 } + vstmia.f64 X!, { d4 - d5 } vmul.f64 d7, d7, d0 - fstmiad X!, { d6 - d7 } + vstmia.f64 X!, { d6 - d7 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vmul.f64 d4, d4, d0 - fstmiad X!, { d4 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vmul.f64 d4, d4, d0 - fstmiad X, { d4 } + vstmia.f64 X, { d4 } add X, X, INC_X .endm @@ -96,30 +96,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s4 - s7 } + vldmia.f32 X, { s4 - s7 } vmul.f32 s4, s4, s0 vmul.f32 s5, s5, s0 vmul.f32 s6, s6, s0 - fstmias X!, { s4 - s5 } + vstmia.f32 X!, { s4 - s5 } vmul.f32 s7, s7, s0 - fstmias X!, { s6 - s7 } + vstmia.f32 X!, { s6 - s7 } .endm .macro KERNEL_F1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vmul.f32 s4, s4, s0 - fstmias X!, { s4 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vmul.f32 s4, s4, s0 - fstmias X, { s4 } + vstmia.f32 X, { s4 } add X, X, INC_X .endm @@ -136,58 +136,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } pld [ X, #X_PRE ] - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X, { d2 - d3 } + vstmia.f64 X, { d2 - d3 } add X, X, INC_X .endm @@ -199,56 +199,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X, { s2 - s3 } + vstmia.f32 X, { s2 - s3 } add X, X, INC_X .endm diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S index 0fd815db8..1ccd29c95 100644 --- a/kernel/arm/scopy_vfp.S +++ b/kernel/arm/scopy_vfp.S @@ -65,17 +65,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F8 pld [ X, #X_PRE ] - fldmias X!, { s0 - s3 } - fldmias X!, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias Y!, { s4 - s7 } + vldmia.f32 X!, { s0 - s3 } + vldmia.f32 X!, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 Y!, { s4 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 } - fstmias Y!, { s0 } + vldmia.f32 X!, { s0 } + vstmia.f32 Y!, { s0 } .endm @@ -85,23 +85,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY_S4 nop - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y @@ -110,8 +110,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 544846258..bb374b5ee 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -68,26 +68,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -96,8 +96,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -109,32 +109,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
nop - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -146,8 +146,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -162,12 +162,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s8 - s9 } - fldmias Y!, { s4 - s5} + vldmia.f32 X!, { s8 - s9 } + vldmia.f32 Y!, { s4 - s5} fmacs s0 , s4, s8 - fldmias X!, { s10 - s11 } + vldmia.f32 X!, { s10 - s11 } fmacs s1 , s5, s9 - fldmias Y!, { s6 - s7 } + vldmia.f32 Y!, { s6 - s7 } fmacs s0 , s6, s10 fmacs s1 , s7, s11 @@ -175,8 +175,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y!, { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y!, { s8 } fmacs s0 , s4, s8 .endm @@ -185,26 +185,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4 nop - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s4, s8 - fldmias X, { s5 } - fldmias Y, { s9 } + vldmia.f32 X, { s5 } + vldmia.f32 Y, { s9 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s5, s9 - fldmias X, { s6 } - fldmias Y, { s10 } + vldmia.f32 X, { s6 } + vldmia.f32 Y, { s10 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s6, s10 - fldmias X, { s7 } - fldmias Y, { s11 } + vldmia.f32 X, { s7 } + vldmia.f32 Y, { s11 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s7, s11 @@ -214,8 +214,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X fmacs s0 , s4, s8 add Y, Y, INC_Y diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 1f21e5a1f..c072f4126 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -112,8 +112,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO! , { s0 - s3 } - fldmias BO! , { s4 - s5 } + vldmia.f32 AO! , { s0 - s3 } + vldmia.f32 BO! , { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 6491d3571..789643f56 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -136,29 +136,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_I pld [ AO , #A_PRE ] - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ BO , #B_PRE ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -174,20 +174,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias BO!, { s8 - s11 } + vldmia.f32 BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - //fldmias AO!, { s2 - s3 } + //vldmia.f32 AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - //fldmias BO!, { s10 - s11 } + //vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -203,17 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s7 } + vldmia.f32 AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias BO!, { s12 - s15 } - //fldmias AO!, { s6 - s7 } + vldmia.f32 BO!, { s12 - s15 } + //vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 fmacs s21 , s1, s9 fmacs s22 , s2, s9 - //fldmias BO!, { s14 - s15 } + //vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -300,7 +300,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA add r4 , CO2, r3 - fldmias CO1, { s8 - s11 } + vldmia.f32 CO1, { s8 - s11 } fmacs s8 , s0 , s16 flds s12, [CO2] @@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO1 , #C_PRE ] - fldmias r4, { s8 - s11 } + vldmia.f32 r4, { s8 - s11 } fmacs s8 , s0 , s24 fsts s12, [CO2] @@ -338,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add CO2, r4 , r3 - fldmias CO2, { s12 - s15 } + vldmia.f32 CO2, { s12 - s15 } fsts s8 , [r4 ] fmacs s12, s0 , s28 @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s15, s0 , s31 pld [ r4 , #C_PRE ] - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } pld [ CO2 , #C_PRE ] add CO1, CO1, #16 diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S index ff4ff0845..dd4596602 100644 --- a/kernel/arm/sgemm_ncopy_2_vfp.S +++ b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S index ab013134e..dbcea5961 100644 --- a/kernel/arm/sgemm_ncopy_4_vfp.S +++ b/kernel/arm/sgemm_ncopy_4_vfp.S @@ -100,10 +100,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s11, [ AO4, #8 ] flds s15, [ AO4, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #16 - fstmias BO!, { s4 - s7 } - fstmias BO!, { s8 - s15 } + vstmia.f32 BO!, { s4 - s7 } + vstmia.f32 BO!, { s8 - s15 } .endm @@ -117,7 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO4, #0 ] add AO3, AO3, #4 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #4 .endm @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s5 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -147,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S index 9bb0e46b1..e61613c5c 100644 --- a/kernel/arm/sgemm_tcopy_4_vfp.S +++ b/kernel/arm/sgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_1 pld [ AO1, #A_PRE ] - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -118,18 +118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } add r3, r3, LDA - fldmias r3, { s4 - s5 } + vldmia.f32 r3, { s4 - s5 } add r3, r3, LDA - fldmias r3, { s6 - s7 } + vldmia.f32 r3, { s6 - s7 } - fstmias BO2, { s0 - s7 } + vstmia.f32 BO2, { s0 - s7 } add AO1, AO1, #8 add BO2, BO2, #32 @@ -137,18 +137,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY1x4 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } add r3, r3, LDA - fldmias r3, { s2 } + vldmia.f32 r3, { s2 } add r3, r3, LDA - fldmias r3, { s3 } + vldmia.f32 r3, { s3 } - fstmias BO3, { s0 - s3 } + vstmia.f32 BO3, { s0 - s3 } add AO1, AO1, #4 add BO3, BO3, #16 @@ -158,12 +158,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -171,12 +171,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -184,12 +184,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } - fstmias BO3, { s0 - s1 } + vstmia.f32 BO3, { s0 - s1 } add AO1, AO1, #4 add BO3, BO3, #8 @@ -199,9 +199,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -209,9 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 @@ -219,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY1x1 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } - fstmias BO3, { s0 } + vstmia.f32 BO3, { s0 } add AO1, AO1, #4 add BO3, BO3, #4 diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 635b1dd13..34fa0ee39 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -118,8 +118,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s5 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index e24d24eba..0f601d5b8 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -122,30 +122,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ AO , #A_PRE-8 ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -161,20 +161,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -190,17 +190,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -325,7 +325,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s11, [r4 , #12 ] fmuls s15, s0 , s31 - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } add CO1, CO1, #16 diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 76661da79..0b3d98912 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -103,29 +103,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y!, { d0 } - fstmiad X!, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y!, { d0 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y, { d0 } - fstmiad X, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y, { d0 } + vstmia.f64 X, { d4 } add X, X, INC_X add Y, Y, INC_Y @@ -135,29 +135,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y!, { s0 } - fstmias X!, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y!, { s0 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y, { s0 } - fstmias X, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y, { s0 } + vstmia.f32 X, { s4 } add X, X, INC_X add Y, Y, INC_Y @@ -174,35 +174,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y!, { d0 - d1 } - fstmiad X!, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y!, { d0 - d1 } + vstmia.f64 X!, { d4 - d5 } .endm .macro KERNEL_S1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y, { d0 - d1 } - fstmiad X, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y, { d0 - d1 } + vstmia.f64 X, { d4 - d5 } add X, X, INC_X add Y, Y, INC_Y @@ -215,33 +215,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y!, { s0 - s1 } - fstmias X!, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y!, { s0 - s1 } + vstmia.f32 X!, { s4 - s5 } .endm .macro KERNEL_S1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y, { s0 - s1 } - fstmias X, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y, { s0 - s1 } + vstmia.f32 X, { s4 - s5 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S index 48aee4ce0..899dd1e36 100644 --- a/kernel/arm/zcopy_vfp.S +++ b/kernel/arm/zcopy_vfp.S @@ -66,15 +66,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ X, #X_PRE+32 ] - fldmiad X!, { d0 - d7 } - fstmiad Y!, { d0 - d7 } + vldmia.f64 X!, { d0 - d7 } + vstmia.f64 Y!, { d0 - d7 } .endm .macro COPY_F1 - fldmiad X!, { d0 - d1 } - fstmiad Y!, { d0 - d1 } + vldmia.f64 X!, { d0 - d1 } + vstmia.f64 Y!, { d0 - d1 } .endm @@ -84,23 +84,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY_S4 nop - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index c0cd92d3c..5ef9f16a9 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -76,15 +76,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 pld [ X, #X_PRE ] @@ -93,15 +93,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 fmacd d2 , d7, d11 @@ -111,8 +111,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -127,8 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -145,8 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -168,8 +168,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 53d18b07b..7934a500e 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -360,7 +360,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -372,9 +372,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } - fldmiad CO2, { d4 - d7 } + vldmia.f64 CO2, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -386,7 +386,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -543,23 +543,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } - fldmiad CO2, { d4 - d5 } + vldmia.f64 CO2, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -714,7 +714,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -726,7 +726,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -843,14 +843,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index a9d4eddeb..cbb10f342 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -374,8 +374,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } - fldmiad CO2, { d8 - d11 } + vldmia.f64 CO1, { d4 - d7 } + vldmia.f64 CO2, { d8 - d11 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -406,8 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -570,8 +570,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } - fldmiad CO2, { d8 - d9 } + vldmia.f64 CO1, { d4 - d5 } + vldmia.f64 CO2, { d8 - d9 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -588,8 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -752,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -769,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -887,7 +887,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -897,7 +897,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S index b3fa225bb..d0661da2a 100644 --- a/kernel/arm/zgemm_ncopy_2_vfp.S +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -87,7 +87,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d6 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -101,7 +101,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -113,7 +113,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -124,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm diff --git a/kernel/arm/zgemm_tcopy_2_vfp.S b/kernel/arm/zgemm_tcopy_2_vfp.S index 7e27ca6a6..5e1a384b1 100644 --- a/kernel/arm/zgemm_tcopy_2_vfp.S +++ b/kernel/arm/zgemm_tcopy_2_vfp.S @@ -74,13 +74,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -88,12 +88,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 -d1 } + vldmia.f64 AO1, { d0 -d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -102,9 +102,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -112,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index 3e3a1bc07..4e64d8785 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -204,7 +204,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -216,9 +216,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -230,7 +230,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -269,14 +269,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, #16 @@ -352,47 +352,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 2193083af..c66fa4fb8 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -151,12 +151,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 - fldmiad AO2!, { d8 - d9 } + vldmia.f64 AO2!, { d8 - d9 } KMAC_R d12 , d5 , d3 KMAC_I d13 , d5 , d2 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F2 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -205,8 +205,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -217,14 +217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -250,9 +250,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } - fldmiad AO2!, { d8 - d9 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } + vldmia.f64 AO2!, { d8 - d9 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -270,25 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -314,8 +314,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -328,14 +328,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index cb6bc050e..4393bc9f6 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -385,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -402,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -567,7 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -577,7 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 3e6962f06..39b12caa0 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -391,8 +391,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -569,8 +569,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index aeccfbf4c..f936cdf47 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -1,17 +1,17 @@ ifndef SNRM2KERNEL -SNRM2KERNEL = nrm2.c +SNRM2KERNEL = ../arm/nrm2.c endif ifndef DNRM2KERNEL -DNRM2KERNEL = nrm2.c +DNRM2KERNEL = ../arm/nrm2.c endif ifndef CNRM2KERNEL -CNRM2KERNEL = znrm2.c +CNRM2KERNEL = ../arm/znrm2.c endif ifndef ZNRM2KERNEL -ZNRM2KERNEL = znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c endif ifndef SCABS_KERNEL diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index d05754628..5c70390dc 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -1,8 +1,3 @@ -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c @@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c @@ -30,86 +20,6 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S 
- -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = sgemm_kernel_4x4.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -130,6 +40,168 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + 
+SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ifneq ($(OS_DARWIN)$(CROSS),11) +SNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +DNRM2KERNEL = nrm2.S +ZNRM2KERNEL = znrm2.S +endif + +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +ifneq ($(OS_DARWIN)$(CROSS),11) + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY =
dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +else + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = 
../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +endif diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 new file mode 100644 index 000000000..c1d33fa3e --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.ARMV8 + + diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 371e488cd..04d6940d7 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -1,4 +1,49 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S @@ -66,13 +111,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S @@ -87,8 +132,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ifeq ($(DGEMM_UNROLL_N), 4) @@ -99,32 +144,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.CORTEXA72 b/kernel/arm64/KERNEL.CORTEXA72 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA72 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.CORTEXA73 b/kernel/arm64/KERNEL.CORTEXA73 new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA73 @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git 
a/kernel/arm64/KERNEL.FALKOR b/kernel/arm64/KERNEL.FALKOR new file mode 100644 index 000000000..007b2ce26 --- /dev/null +++ b/kernel/arm64/KERNEL.FALKOR @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.CORTEXA57 + + diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index 11b7a2ca8..cb02c7bc5 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -1,6 +1,133 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx.c +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot_thunderx.c +DDOTKERNEL = ddot_thunderx.c +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = 
gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SDOTKERNEL=dot_thunderx.c -DDOTKERNEL=ddot_thunderx.c 
-DAXPYKERNEL=daxpy_thunderx.c diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index b66cd0e8b..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -1,4 +1,137 @@ -include $(KERNELDIR)/KERNEL.CORTEXA57 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = 
gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = 
cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c @@ -27,12 +160,12 @@ CNRM2KERNEL = scnrm2_thunderx2t99.c DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DAXPYKERNEL = daxpy_thunderx2t99.S DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S diff --git a/kernel/arm64/KERNEL.VULCAN b/kernel/arm64/KERNEL.VULCAN deleted file mode 100644 index 8b0273951..000000000 --- a/kernel/arm64/KERNEL.VULCAN +++ /dev/null @@ -1,3 +0,0 @@ -include $(KERNELDIR)/KERNEL.THUNDERX2T99 - - diff --git a/kernel/arm64/KERNEL.XGENE1 b/kernel/arm64/KERNEL.XGENE1 deleted file mode 100644 index 6ee0c730c..000000000 --- a/kernel/arm64/KERNEL.XGENE1 +++ /dev/null @@ -1 +0,0 @@ -include $(KERNELDIR)/KERNEL.ARMV8 \ No newline at end of file diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 598db6e0c..d1551ffea 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -943,13 +943,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] - - ldr A_PRE_SIZE, =dgemm_prefetch_size_a - ldr A_PRE_SIZE, [A_PRE_SIZE] - ldr B_PRE_SIZE, =dgemm_prefetch_size_b - ldr B_PRE_SIZE, [B_PRE_SIZE] - ldr C_PRE_SIZE, =dgemm_prefetch_size_c - ldr C_PRE_SIZE, [C_PRE_SIZE] + mov A_PRE_SIZE, #3584 + mov B_PRE_SIZE, #512 + mov C_PRE_SIZE, #128 add A_PRE_SIZE_64, A_PRE_SIZE, #64 add B_PRE_SIZE_64, B_PRE_SIZE, #64 diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c index 4c0a76cbd..0f4b0a9f7 100644 --- a/kernel/generic/trmm_lncopy_16.c +++ b/kernel/generic/trmm_lncopy_16.c @@ -661,7 +661,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; - b[ 11] = ZERO; + b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index e257dcfc9..61da7445f 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,12 +1,13 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c -SROTKERNEL = ../mips/rot.c -DROTKERNEL = ../mips/rot.c -CROTKERNEL = ../mips/zrot.c -ZROTKERNEL = ../mips/zrot.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c - + + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 2d03ad7fa..0298faaad 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -63,6 +63,7 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DSDOTKERNEL = ../mips/dot.c diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S index 5904bc580..765e5ebbb 100644 --- a/kernel/mips64/axpy_loongson3a.S +++ b/kernel/mips64/axpy_loongson3a.S @@ -270,6 +270,7 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .L20: + beqz INCY, .L27 dsra I, N, 3 move YY, Y @@ -450,5 +451,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index f54008bc2..23225770a 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -562,6 +562,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //INCX!=1 or INCY != 1 .L20: + beq INCY, $0, .L27 dsra I, N, 3 move YY, Y @@ -754,5 +755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 37b20a880..82703ff5d 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -146,11 +146,11 @@ sd $21, 40($sp) sd $22, 48($sp) - ST $f24, 56($sp) - ST $f25, 64($sp) - ST $f26, 72($sp) - ST $f27, 80($sp) - ST $f28, 88($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) @@ -161,10 +161,10 @@ #endif #ifndef __64BIT__ - ST $f20,120($sp) - ST $f21,128($sp) - ST $f22,136($sp) - ST $f23,144($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) #endif .align 4 @@ -7766,11 +7766,11 @@ ld $21, 40($sp) ld $22, 48($sp) - LD $f24, 56($sp) - LD $f25, 64($sp) - LD $f26, 72($sp) - LD $f27, 80($sp) - LD $f28, 88($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) 
+ ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) @@ -7779,10 +7779,10 @@ #endif #ifndef __64BIT__ - LD $f20,120($sp) - LD $f21,128($sp) - LD $f22,136($sp) - LD $f23,144($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f654de110..6d4028b0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -294,6 +294,8 @@ gotoblas_t TABLE_NAME = { chemm_outcopyTS, chemm_oltcopyTS, 0, 0, 0, + +#if defined(USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), #else @@ -324,6 +326,33 @@ gotoblas_t TABLE_NAME = { chemm3m_oucopybTS, chemm3m_olcopybTS, chemm3m_oucopyrTS, chemm3m_olcopyrTS, chemm3m_oucopyiTS, chemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK cneg_tcopyTS, claswp_ncopyTS, @@ -400,6 +429,7 @@ gotoblas_t TABLE_NAME = { zhemm_outcopyTS, zhemm_oltcopyTS, 0, 0, 0, +#if defined(USE_GEMM3M) #ifdef ZGEMM3M_DEFAULT_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), #else @@ -430,6 +460,33 @@ gotoblas_t TABLE_NAME = { zhemm3m_oucopybTS, zhemm3m_olcopybTS, zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK zneg_tcopyTS, 
zlaswp_ncopyTS, @@ -503,6 +560,7 @@ gotoblas_t TABLE_NAME = { xhemm_outcopyTS, xhemm_oltcopyTS, 0, 0, 0, +#if defined(USE_GEMM3M) QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), xgemm3m_kernelTS, @@ -528,6 +586,33 @@ gotoblas_t TABLE_NAME = { xhemm3m_oucopybTS, xhemm3m_olcopybTS, xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, +#else + 0, 0, 0, + + NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, + + NULL, NULL, + NULL, NULL, + NULL, NULL, +#endif #ifndef NO_LAPACK xneg_tcopyTS, xlaswp_ncopyTS, @@ -561,6 +646,78 @@ gotoblas_t TABLE_NAME = { }; +#if defined(ARCH_ARM64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + 
TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif + +} +#else // defined(ARCH_ARM64) #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1146,3 +1303,4 @@ static void init_parameter(void) { } +#endif //defined(ARCH_ARM64) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1256f4c3c..acc6356d6 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,19 +1,18 @@ include $(KERNELDIR)/KERNEL.HASWELL -SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_skylakex.c +SGEMMONCOPY = sgemm_ncopy_4_skylakex.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -#DTRMMKERNEL = ../generic/trmmkernel_16x2.c -#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -#DGEMMINCOPY = ../generic/gemm_ncopy_16.c -#DGEMMITCOPY = ../generic/gemm_tcopy_16.c -#DGEMMONCOPY = ../generic/gemm_ncopy_2.c -#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c +DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c 
-SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c +SGEMM_BETA = sgemm_beta_skylakex.c +DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c new file mode 100644 index 000000000..6a824c9b5 --- /dev/null +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -0,0 +1,152 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#include <immintrin.h> +/* Scale, in place, the n columns of c (m contiguous elements each, consecutive columns ldc elements apart) by beta; when beta is zero the elements are overwritten with zeros without being read. The dummy arguments are unused. Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + /* fast path: the columns are contiguous (m == ldc) and beta is exactly zero, so clear all m * n elements with one memset. beta is compared as a floating-point value; an integer cast truncates and made every |beta| < 1 look like zero. */ + if (m == ldc && beta == ZERO) { + memset(c, 0, m * n * sizeof(FLOAT)); + return 0; + } + + if (m == 0 || n == 0) + return 0; + + c_offset = c; + + if (beta == ZERO){ + __m512d z_zero; + /* beta == 0: store zeros column by column with unaligned 512-bit stores (32, then 8 elements per pass), then one element at a time */ + z_zero = _mm512_setzero_pd(); + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = m; + + while (i >= 32) { + _mm512_storeu_pd(c_offset1, z_zero); + _mm512_storeu_pd(c_offset1 + 8, z_zero); + _mm512_storeu_pd(c_offset1 + 16, z_zero); + _mm512_storeu_pd(c_offset1 + 24 , z_zero); + c_offset1 += 32; + i -= 32; + } + while (i >= 8) { + _mm512_storeu_pd(c_offset1, z_zero); + c_offset1 += 8; + i -= 8; + } + + while (i > 0) { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } + j --; + } while (j > 0); + + } else { + /* general case: multiply every element of each column by beta, eight elements per unrolled pass */ + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; +
*(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + /* the m & 7 elements of this column left over by the unrolled pass */ + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +} diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c new file mode 100644 index 000000000..a83ca98fa --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -0,0 +1,1565 @@ +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/* + * This file is based on dgemm_kernel_4x8_haswell.s (original copyright above). + * The content got translated from ASM to C+intrinsics, significantly simplified, + * and AVX512 support added by Arjan van de Ven + */ + + +#include "common.h" +#include <immintrin.h> + + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + + +/******************************************************************************************/ + +/* INIT4x8: zero the eight 256-bit accumulators ymm4..ymm11. */ +#define INIT4x8() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + ymm8 = _mm256_setzero_pd(); \ + ymm9 = _mm256_setzero_pd(); \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + +/* KERNEL4x8_SUB: one k step. Loads 4 doubles of A (at AO - 16) and 8 doubles of B (at BO - 12), accumulates their products against four lane permutations of the A vector as traced in the comments, then advances AO by 4 and BO by 8. */ +#define KERNEL4x8_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ +/* ymm0 [ A B C D ] */ \ + ymm1 = _mm256_loadu_pd(BO - 12); \ + ymm2 = _mm256_loadu_pd(BO - 8); \ +/* ymm1 [ 1 2 3 4 ] */ \ +/* ymm2 [ 5 6 7 8 ] */ \ + \ + ymm4 += ymm0 * ymm1; \ +/* ymm4 += [ A*1 | B*2 | C*3 | D*4 ] */ \ + ymm8 += ymm0 * ymm2; \ +/* ymm8 += [ A*5 | B*6 | C*7 | D*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ +/* ymm0 [ B A D C ] */ \ + ymm5 += ymm0 * ymm1; \ +/* ymm5 += [ B*1 | A*2 | D*3 | C*4 ] */ \ + ymm9 += ymm0 * ymm2; \ +/* ymm9 +=
[ B*5 | A*6 | D*7 | C*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ +/* ymm0 [ C D A B ] */ \ + ymm6 += ymm0 * ymm1; \ +/* ymm6 += [ C*1 | D*2 | A*3 | B*4 ] */ \ + ymm10+= ymm0 * ymm2; \ +/* ymm10 += [ C*5 | D*6 | A*7 | B*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ +/* ymm0 [ D C B A ] */ \ + ymm7 += ymm0 * ymm1; \ +/* ymm7 += [ D*1 | C*2 | B*3 | A*4 ] */ \ + ymm11+= ymm0 * ymm2; \ +/* ymm11 += [ D*5 | C*6 | B*7 | A*8 ] */ \ + AO += 4; \ + BO += 8; + +/* SAVE4x8: multiply the eight accumulators by ALPHA, un-shuffle their lanes as traced in the comments below, add the result to the values already stored at CO1 and at multiples of ldc after it, and advance CO1 by 4. */ +#define SAVE4x8(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + ymm8 *= ymm0; \ + ymm9 *= ymm0; \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + \ +/* Entry values: */ \ +/* ymm4 = a [ A*1 | B*2 | C*3 | D*4 ] */ \ +/* ymm5 = a [ B*1 | A*2 | D*3 | C*4 ] */ \ +/* ymm6 = a [ C*1 | D*2 | A*3 | B*4 ] */ \ +/* ymm7 = a [ D*1 | C*2 | B*3 | A*4 ] */ \ +/* ymm8 = a [ A*5 | B*6 | C*7 | D*8 ] */ \ +/* ymm9 = a [ B*5 | A*6 | D*7 | C*8 ] */ \ +/* ymm10 = a [ C*5 | D*6 | A*7 | B*8 ] */ \ +/* ymm11 = a [ D*5 | C*6 | B*7 | A*8 ] */ \ + \ + ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ +/* ymm5 = a [ A*2 | B*1 | C*4 | D*3 ] */ \ + ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ +/* ymm7 = a [ C*2 | D*1 | A*4 | B*3 ] */ \ + \ + ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ +/* ymm0 = a [ A*1 | B*1 | C*3 | D*3 ] */ \ +/* ymm1 = a [ A*2 | B*2 | C*4 | D*4 ] */ \ + ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ +/* ymm2 = a [ C*1 | D*1 | A*3 | B*3 ] */ \ +/* ymm3 = a [ C*2 | D*2 | A*4 | B*4 ] */ \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ +/* ymm2 = a [ B*3 | A*3 | D*1 | C*1 ] */ \ +/* ymm3 = a [ B*4 | A*4 | D*2 | C*2 ] */ \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ +/* ymm2 = a [ A*3 | B*3 | C*1 | D*1 ] */ \ +/* ymm3 = a [ A*4 | B*4 | C*2 | D*2 ] */ \ + \ + ymm4 =
_mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ +/* ymm4 = a [ A*1 | B*1 | C*1 | D*1 ] */ \ +/* ymm5 = a [ A*2 | B*2 | C*2 | D*2 ] */ \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ +/* ymm6 = a [ A*3 | B*3 | C*3 | D*3 ] */ \ +/* ymm7 = a [ A*4 | B*4 | C*4 | D*4 ] */ \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ +/* Second half: the same un-shuffle applied to ymm8..ymm11 (the products with B values 5..8), accumulated at offsets 4*ldc .. 7*ldc from CO1. */ \ + ymm9 = _mm256_permute4x64_pd(ymm9, 0xb1); \ + ymm11 = _mm256_permute4x64_pd(ymm11, 0xb1); \ + \ + ymm0 = _mm256_blend_pd(ymm8, ymm9, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm8, ymm9, 0x05); \ + ymm2 = _mm256_blend_pd(ymm10, ymm11, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm10, ymm11, 0x05); \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (4 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (5 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (6 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (7 * ldc)); \ + _mm256_storeu_pd(CO1 + (4 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (5 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (6 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (7 * ldc), ymm7); \ + \ + CO1 += 4; + +/******************************************************************************************/ +/* INIT2x8: zero the eight 128-bit accumulators xmm4..xmm11. */ +#define INIT2x8() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 =
_mm_setzero_pd(); \ + xmm8 = _mm_setzero_pd(); \ + xmm9 = _mm_setzero_pd(); \ + xmm10 = _mm_setzero_pd(); \ + xmm11 = _mm_setzero_pd(); \ + + +#define KERNEL2x8_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm2 = _mm_set1_pd(*(BO - 8)); \ + xmm6 += xmm0 * xmm3; \ + xmm3 = _mm_set1_pd(*(BO - 7)); \ + xmm7 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 6)); \ + xmm8 += xmm0 * xmm2; \ + xmm2 = _mm_set1_pd(*(BO - 5)); \ + xmm9 += xmm0 * xmm3; \ + xmm10 += xmm0 * xmm1; \ + xmm11 += xmm0 * xmm2; \ + BO += 8; \ + AO += 2; + +#define SAVE2x8(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + xmm8 *= xmm0; \ + xmm9 *= xmm0; \ + xmm10 *= xmm0; \ + xmm11 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + xmm8 += _mm_loadu_pd(CO1 + (4 * ldc)); \ + xmm9 += _mm_loadu_pd(CO1 + (5 * ldc)); \ + xmm10+= _mm_loadu_pd(CO1 + (6 * ldc)); \ + xmm11+= _mm_loadu_pd(CO1 + (7 * ldc)); \ + _mm_storeu_pd(CO1 + (4 * ldc), xmm8); \ + _mm_storeu_pd(CO1 + (5 * ldc), xmm9); \ + _mm_storeu_pd(CO1 + (6 * ldc), xmm10); \ + _mm_storeu_pd(CO1 + (7 * ldc), xmm11); \ + CO1 += 2; + + + + +/******************************************************************************************/ + +#define INIT1x8() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + dbl8 = 0; \ + dbl9 = 0; \ + dbl10 = 0; \ + dbl11 = 0; + + +#define KERNEL1x8_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl4 += dbl0 * 
dbl1; \ + dbl1 = *(BO - 9); \ + dbl5 += dbl0 * dbl2; \ + dbl2 = *(BO - 8); \ + dbl6 += dbl0 * dbl3; \ + dbl3 = *(BO - 7); \ + dbl7 += dbl0 * dbl1; \ + dbl1 = *(BO - 6); \ + dbl8 += dbl0 * dbl2; \ + dbl2 = *(BO - 5); \ + dbl9 += dbl0 * dbl3; \ + dbl10 += dbl0 * dbl1; \ + dbl11 += dbl0 * dbl2; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + dbl8 *= dbl0; \ + dbl9 *= dbl0; \ + dbl10 *= dbl0; \ + dbl11 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + dbl8 += *(CO1 + (4 * ldc)); \ + dbl9 += *(CO1 + (5 * ldc)); \ + dbl10 += *(CO1 + (6 * ldc)); \ + dbl11 += *(CO1 + (7 * ldc)); \ + *(CO1 + (4 * ldc)) = dbl8; \ + *(CO1 + (5 * ldc)) = dbl9; \ + *(CO1 + (6 * ldc)) = dbl10; \ + *(CO1 + (7 * ldc)) = dbl11; \ + \ + CO1 += 1; + + + + + + +/******************************************************************************************/ + +#define INIT4x4() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL4x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \ + \ + ymm4 += ymm0 * ymm1; \ + \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \ + ymm5 += ymm0 * ymm1; \ + \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \ + ymm6 += ymm0 * ymm1; \ + \ + ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \ + ymm7 += ymm0 * ymm1; \ + AO += 4; \ + BO += 4; + + +#define SAVE4x4(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 
* ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x4() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + + +#define KERNEL2x4_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm0 * xmm1; \ + BO += 4; \ + AO += 2; + + + +#define SAVE2x4(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + CO1 += 2; + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x4() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + +#define KERNEL1x4_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl8 = *(BO - 9); \ + \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + dbl6 += dbl0 * dbl3; \ + dbl7 += dbl0 * dbl8; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + dbl0 = 
ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + \ + CO1 += 1; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT8x4() \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + ymm12 = _mm256_setzero_pd(); \ + ymm13 = _mm256_setzero_pd(); \ + ymm14 = _mm256_setzero_pd(); \ + ymm15 = _mm256_setzero_pd(); \ + ymm16 = _mm256_setzero_pd(); \ + ymm17 = _mm256_setzero_pd(); \ + + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 = _mm256_set1_pd(*(BO - 10)); \ + ymm5 = _mm256_set1_pd(*(BO - 9)); \ + ymm10 += ymm0 * ymm2; \ + ymm11 += ymm1 * ymm2; \ + ymm12 += ymm0 * ymm3; \ + ymm13 += ymm1 * ymm3; \ + ymm14 += ymm0 * ymm4; \ + ymm15 += ymm1 * ymm4; \ + ymm16 += ymm0 * ymm5; \ + ymm17 += ymm1 * ymm5; \ + BO += 4; \ + AO += 8; + + + +#define SAVE8x4(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + ymm12 *= ymm0; \ + ymm13 *= ymm0; \ + ymm14 *= ymm0; \ + ymm15 *= ymm0; \ + ymm16 *= ymm0; \ + ymm17 *= ymm0; \ + \ + ymm10 += _mm256_loadu_pd(CO1); \ + ymm11 += _mm256_loadu_pd(CO1 + 4); \ + ymm12 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm13 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + ymm14 += _mm256_loadu_pd(CO1 + (ldc*2)); \ + ymm15 += _mm256_loadu_pd(CO1 + (ldc*2) + 4); \ + ymm16 += _mm256_loadu_pd(CO1 + (ldc*3)); \ + ymm17 += _mm256_loadu_pd(CO1 + (ldc*3) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm10); \ + _mm256_storeu_pd(CO1 + 4, ymm11); \ + 
_mm256_storeu_pd(CO1 + ldc, ymm12); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm13); \ + _mm256_storeu_pd(CO1 + ldc*2, ymm14); \ + _mm256_storeu_pd(CO1 + ldc*2 + 4, ymm15); \ + _mm256_storeu_pd(CO1 + ldc*3, ymm16); \ + _mm256_storeu_pd(CO1 + ldc*3 + 4, ymm17); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT8x2() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 += ymm0 * ymm2; \ + ymm5 += ymm1 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm7 += ymm1 * ymm3; \ + BO += 2; \ + AO += 8; + + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + ymm5 += _mm256_loadu_pd(CO1 + 4); \ + ymm6 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm4); \ + _mm256_storeu_pd(CO1 + 4, ymm5); \ + _mm256_storeu_pd(CO1 + ldc, ymm6); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm7); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT4x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_loadu_pd(AO - 14); \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm5 += xmm1 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm1 * xmm3; \ + 
BO += 2; \ + AO += 4; + + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm5 += _mm_loadu_pd(CO1 + 2); \ + xmm6 += _mm_loadu_pd(CO1 + (ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (ldc) + 2); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + 2, xmm5); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + _mm_storeu_pd(CO1 + ldc + 2, xmm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + + + +#define KERNEL2x2_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + BO += 2; \ + AO += 2; + + +#define SAVE2x2(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm6 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm6 += _mm_loadu_pd(CO1 + ldc); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x2() \ + dbl4 = 0; \ + dbl5 = 0; + + +#define KERNEL1x2_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + \ + \ + CO1 += 1; + + + +/******************************************************************************************/ 
+/******************************************************************************************/ + +#define INIT4x1() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); + + +#define KERNEL4x1() \ + ymm0 = _mm256_set1_pd(*(BO - 12)); \ + ymm1 = _mm256_set1_pd(*(BO - 11)); \ + ymm2 = _mm256_set1_pd(*(BO - 10)); \ + ymm3 = _mm256_set1_pd(*(BO - 9)); \ + \ + ymm4 += _mm256_loadu_pd(AO - 16) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO - 12) * ymm1; \ + \ + ymm0 = _mm256_set1_pd(*(BO - 8)); \ + ymm1 = _mm256_set1_pd(*(BO - 7)); \ + \ + ymm6 += _mm256_loadu_pd(AO - 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO - 4) * ymm3; \ + \ + ymm2 = _mm256_set1_pd(*(BO - 6)); \ + ymm3 = _mm256_set1_pd(*(BO - 5)); \ + \ + ymm4 += _mm256_loadu_pd(AO + 0) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO + 4) * ymm1; \ + ymm6 += _mm256_loadu_pd(AO + 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO + 12) * ymm3; \ + \ + BO += 8; \ + AO += 32; + + +#define INIT8x1() \ + zmm4 = _mm512_setzero_pd(); \ + + +#define KERNEL8x1_SUB() \ + zmm2 = _mm512_set1_pd(*(BO - 12)); \ + zmm0 = _mm512_loadu_pd(AO - 16); \ + zmm4 += zmm0 * zmm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + zmm0 = _mm512_set1_pd(ALPHA); \ + zmm4 *= zmm0; \ + \ + zmm4 += _mm512_loadu_pd(CO1); \ + _mm512_storeu_pd(CO1, zmm4); \ + CO1 += 8; + +#define KERNEL4x1_SUB() \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 += ymm5; \ + ymm6 += ymm7; \ + ymm4 += ymm6; \ + ymm4 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + _mm256_storeu_pd(CO1, ymm4); \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x1() \ + xmm4 = _mm_setzero_pd(); + + +#define 
KERNEL2x1_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm4 += xmm0 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x1() \ + dbl4 = 0; + +#define KERNEL1x1_SUB() \ + dbl1 = *(BO - 12); \ + dbl0 = *(AO - 16); \ + dbl4 += dbl0 * dbl1; \ + BO += 1; \ + AO += 1; + +#define SAVE1x1(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl4 += *CO1; \ + *CO1 = dbl4; \ + CO1 += 1; + + +/*******************************************************************************************/ + +/* START */ + + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) +{ + unsigned long M=m, N=n, K=k; + + + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + while (N >= 8) { + double *CO1; + double *AO; + int i; + + CO1 = C; + C += 8 * ldc; + + AO = A + 16; + + i = m; + + while (i >= 24) { + double *BO; + double *A1, *A2; + int kloop = K; + + BO = B + 12; + A1 = AO + 8 * K; + A2 = AO + 16 * K; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + 
"vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "vmovapd %%zmm1, %%zmm21\n" + "vmovapd %%zmm1, %%zmm22\n" + "vmovapd %%zmm1, %%zmm23\n" + "vmovapd %%zmm1, %%zmm24\n" + "vmovapd %%zmm1, %%zmm25\n" + "vmovapd %%zmm1, %%zmm26\n" + "vmovapd %%zmm1, %%zmm27\n" + "vmovapd %%zmm1, %%zmm28\n" + "jmp .label24\n" + ".align 32\n" + /* Inner math loop */ + ".label24:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + "vmovupd -128(%[A2]),%%zmm20\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm21\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm22\n" + + "vbroadcastsd -80(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm23\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm24\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm25\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm26\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm27\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm28\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[A2]\n" + "add $64, %[BO]\n" + "prefetch 
512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[A2])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label24\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + /* And store additively in C */ + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n" + "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n" + "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n" + "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n" + "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n" + "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n" + "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n" + "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + "vfmadd213pd 128(%[C0]), %%zmm9, %%zmm21\n" + "vfmadd213pd 128(%[C1]), %%zmm9, %%zmm22\n" + "vfmadd213pd 128(%[C2]), %%zmm9, %%zmm23\n" + "vfmadd213pd 128(%[C3]), %%zmm9, %%zmm24\n" + "vfmadd213pd 128(%[C4]), %%zmm9, %%zmm25\n" + "vfmadd213pd 128(%[C5]), %%zmm9, %%zmm26\n" + "vfmadd213pd 128(%[C6]), %%zmm9, %%zmm27\n" + "vfmadd213pd 128(%[C7]), %%zmm9, %%zmm28\n" + "vmovupd %%zmm21, 128(%[C0])\n" + "vmovupd %%zmm22, 128(%[C1])\n" + "vmovupd %%zmm23, 128(%[C2])\n" + "vmovupd %%zmm24, 128(%[C3])\n" + "vmovupd %%zmm25, 128(%[C4])\n" + "vmovupd 
%%zmm26, 128(%[C5])\n" + "vmovupd %%zmm27, 128(%[C6])\n" + "vmovupd %%zmm28, 128(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [A2] "+r" (A2), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", + "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28" + ); + CO1 += 24; + AO += 16 * K; + i-= 24; + } + + + while (i >= 16) { + double *BO; + double *A1; + int kloop = K; + + BO = B + 12; + A1 = AO + 8 * K; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + "vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "jmp .label16\n" + ".align 32\n" + /* Inner math loop */ + ".label16:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + + "vbroadcastsd -80(%[BO]), 
%%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[BO]\n" + "prefetch 512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label16\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + /* And store additively in C */ + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n" + "vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n" + "vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n" + "vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n" + "vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n" + "vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n" + "vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n" + "vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 
64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18" + ); + CO1 += 16; + AO += 8 * K; + i-= 16; + } + + while (i >= 8) { + double *BO; + int kloop = K; + + BO = B + 12; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vbroadcastsd (%[alpha]), %%zmm9\n" + "jmp .label1\n" + ".align 32\n" + /* Inner math loop */ + ".label1:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vfmadd231pd -96(%[BO])%{1to8%}, %%zmm0, %%zmm1\n" + "vfmadd231pd -88(%[BO])%{1to8%}, %%zmm0, %%zmm2\n" + "vfmadd231pd -80(%[BO])%{1to8%}, %%zmm0, %%zmm3\n" + "vfmadd231pd -72(%[BO])%{1to8%}, %%zmm0, %%zmm4\n" + "vfmadd231pd -64(%[BO])%{1to8%}, %%zmm0, %%zmm5\n" + "vfmadd231pd -56(%[BO])%{1to8%}, %%zmm0, %%zmm6\n" + "vfmadd231pd -48(%[BO])%{1to8%}, %%zmm0, %%zmm7\n" + "vfmadd231pd -40(%[BO])%{1to8%}, %%zmm0, %%zmm8\n" + "add $64, %[AO]\n" + "add $64, %[BO]\n" + "subl $1, %[kloop]\n" + "jg .label1\n" + /* multiply the result by alpha 
and add to the memory */ + "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n" + "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n" + "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n" + "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n" + "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n" + "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n" + "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n" + "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + : + [AO] "+r" (AO), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9" + ); + CO1 += 8; + i-= 8; + } + + + + while (i >= 4) { + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11; + int kloop = K; + + BO = B + 12; + INIT4x8() + + while (kloop > 0) { + KERNEL4x8_SUB() + kloop--; + } + SAVE4x8(alpha) + i-= 4; + } + + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + int kloop = K; + + BO = B + 12; + INIT2x8() + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + i -= 2; + } + + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8, dbl9, dbl10, dbl11; + int kloop = K; + + BO = B + 12; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + i -= 1; + } + B += K * 8; + N -= 8; + } + + if (N == 0) + return 0; + + + + // L8_0 + while (N >= 4) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A + 16; + + i = m; + while (i 
>= 8) { + double *BO; + // L8_11 + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm10, ymm11,ymm12,ymm13,ymm14,ymm15,ymm16,ymm17; + BO = B + 12; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + + i -= 8; + } + while (i >= 4) { + // L8_11 + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + BO = B + 12; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + BO = B; + BO += 12; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8; + int kloop = K; + BO = B + 12; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + + i-=8; + } + + while (i >= 4) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) 
+ + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm2, xmm3, xmm4, xmm6; + int kloop = K; + BO = B + 12; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl4, dbl5; + int kloop = K; + BO = B + 12; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + i -= 1; + } + + B += K * 2; + N -= 2; + } + + // L8_0 + while (N >= 1) { + // L8_10 + double *CO1; + double *AO; + int i; + + CO1 = C; + C += ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + __m512d zmm0, zmm2, zmm4; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT8x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x1_SUB() + kloop--; + } + // L8_19 + SAVE8x1(alpha) + + i-= 8; + } + while (i >= 4) { + double *BO; + __m256d ymm0, ymm2, ymm4, ymm5, ymm6, ymm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT4x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x1_SUB() + kloop--; + } + // L8_19 + SAVE4x1(alpha) + + i-= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm2, xmm4; + int kloop = K; + BO = B; + BO += 12; + + INIT2x1() + + while (kloop > 0) { + KERNEL2x1_SUB() + kloop--; + } + SAVE2x1(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl4; + int kloop = K; + + BO = B; + BO += 12; + INIT1x1() + + + while (kloop > 0) { + KERNEL1x1_SUB() + kloop--; + } + SAVE1x1(alpha) + i -= 1; + } + + B += K * 1; + N -= 1; + } + + + return 0; +} diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c 
b/kernel/x86_64/dgemm_ncopy_8_skylakex.c new file mode 100644 index 000000000..74b336f3d --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -0,0 +1,421 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + __m128d xmm0, xmm1; + xmm0 = _mm_load_pd1(aoffset2 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 0); + _mm_storeu_pd(boffset + 0, xmm0); + + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + xmm1 = _mm_load_pd1(aoffset4 + 0); + xmm1 = _mm_loadl_pd(xmm1, aoffset3 + 0); + 
_mm_storeu_pd(boffset + 2, xmm1); + + xmm0 = _mm_load_pd1(aoffset6 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 0); + _mm_storeu_pd(boffset + 4, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 0); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 0); + _mm_storeu_pd(boffset + 6, xmm0); + + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + xmm0 = _mm_load_pd1(aoffset2 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 1); + _mm_storeu_pd(boffset + 8, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 1); + _mm_storeu_pd(boffset + 10, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 1); + _mm_storeu_pd(boffset + 12, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 1); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 1); + _mm_storeu_pd(boffset + 14, xmm0); + + xmm0 = _mm_load_pd1(aoffset2 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 2); + _mm_storeu_pd(boffset + 16, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 2); + _mm_storeu_pd(boffset + 18, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 2); + _mm_storeu_pd(boffset + 20, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 2); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 2); + _mm_storeu_pd(boffset + 22, xmm0); + + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + xmm0 = _mm_load_pd1(aoffset2 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 3); + _mm_storeu_pd(boffset + 24, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 3); + _mm_storeu_pd(boffset + 26, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 3); + _mm_storeu_pd(boffset + 28, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 3); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 3); + _mm_storeu_pd(boffset + 30, xmm0); + + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + + xmm0 = _mm_load_pd1(aoffset2 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 4); + _mm_storeu_pd(boffset + 32, xmm0); + 
+ xmm0 = _mm_load_pd1(aoffset4 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 4); + _mm_storeu_pd(boffset + 34, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 4); + _mm_storeu_pd(boffset + 36, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 4); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 4); + _mm_storeu_pd(boffset + 38, xmm0); + + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + + xmm0 = _mm_load_pd1(aoffset2 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset1 + 5); + _mm_storeu_pd(boffset + 40, xmm0); + + xmm0 = _mm_load_pd1(aoffset4 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset3 + 5); + _mm_storeu_pd(boffset + 42, xmm0); + + xmm0 = _mm_load_pd1(aoffset6 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset5 + 5); + _mm_storeu_pd(boffset + 44, xmm0); + + xmm0 = _mm_load_pd1(aoffset8 + 5); + xmm0 = _mm_loadl_pd(xmm0, aoffset7 + 5); + _mm_storeu_pd(boffset + 46, xmm0); + + + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + + + *(boffset + 48) = ctemp07; + *(boffset + 49) = ctemp15; + *(boffset + 50) = ctemp23; + *(boffset + 51) = ctemp31; + *(boffset + 52) = ctemp39; + *(boffset + 53) = ctemp47; + *(boffset + 54) = ctemp55; + *(boffset + 55) = ctemp63; + + *(boffset + 56) = ctemp08; + *(boffset + 57) = ctemp16; + *(boffset + 58) = ctemp24; + *(boffset + 59) = ctemp32; + *(boffset + 60) = ctemp40; + *(boffset + 61) = ctemp48; + *(boffset + 62) = ctemp56; + *(boffset + 63) = ctemp64; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + 
*(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp13; + + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp10; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp07; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp15; + + *(boffset + 12) = ctemp04; + *(boffset + 13) = ctemp08; + *(boffset + 14) = ctemp12; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 
++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/x86_64/dgemm_tcopy_8_skylakex.c b/kernel/x86_64/dgemm_tcopy_8_skylakex.c new file mode 100644 index 000000000..472ad6349 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_8_skylakex.c @@ -0,0 +1,417 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1, row2, row3, row4, row5, row6, row7, row8; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + row3 = _mm512_loadu_pd(aoffset3); + aoffset3 += 8; + row4 = _mm512_loadu_pd(aoffset4); + aoffset4 += 8; + row5 = _mm512_loadu_pd(aoffset5); + aoffset5 += 8; + row6 = _mm512_loadu_pd(aoffset6); + aoffset6 += 8; + row7 = _mm512_loadu_pd(aoffset7); + aoffset7 += 8; + row8 = _mm512_loadu_pd(aoffset8); + aoffset8 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + _mm512_storeu_pd(boffset1 + 16, row3); + _mm512_storeu_pd(boffset1 + 24, row4); + _mm512_storeu_pd(boffset1 + 32, row5); + _mm512_storeu_pd(boffset1 + 40, row6); + _mm512_storeu_pd(boffset1 + 48, row7); + _mm512_storeu_pd(boffset1 + 56, row8); + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1, row2, row3, row4, row5, row6, row7, row8; + 
row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + row3 = _mm256_loadu_pd(aoffset3); + aoffset3 += 4; + row4 = _mm256_loadu_pd(aoffset4); + aoffset4 += 4; + row5 = _mm256_loadu_pd(aoffset5); + aoffset5 += 4; + row6 = _mm256_loadu_pd(aoffset6); + aoffset6 += 4; + row7 = _mm256_loadu_pd(aoffset7); + aoffset7 += 4; + row8 = _mm256_loadu_pd(aoffset8); + aoffset8 += 4; + + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + _mm256_storeu_pd(boffset2 + 8, row3); + _mm256_storeu_pd(boffset2 + 12, row4); + _mm256_storeu_pd(boffset2 + 16, row5); + _mm256_storeu_pd(boffset2 + 20, row6); + _mm256_storeu_pd(boffset2 + 24, row7); + _mm256_storeu_pd(boffset2 + 28, row8); + boffset2 += 32; + } + + if (n & 2){ + __m128d row1, row2, row3, row4, row5, row6, row7, row8; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + row3 = _mm_loadu_pd(aoffset3); + aoffset3 += 2; + + row4 = _mm_loadu_pd(aoffset4); + aoffset4 += 2; + + row5 = _mm_loadu_pd(aoffset5); + aoffset5 += 2; + + row6 = _mm_loadu_pd(aoffset6); + aoffset6 += 2; + + row7 = _mm_loadu_pd(aoffset7); + aoffset7 += 2; + + row8 = _mm_loadu_pd(aoffset8); + aoffset8 += 2; + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + _mm_storeu_pd(boffset3 + 4, row3); + _mm_storeu_pd(boffset3 + 6, row4); + _mm_storeu_pd(boffset3 + 8, row5); + _mm_storeu_pd(boffset3 + 10, row6); + _mm_storeu_pd(boffset3 + 12, row7); + _mm_storeu_pd(boffset3 + 14, row8); + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) 
= ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + __m512d row1, row2, row3, row4; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + row3 = _mm512_loadu_pd(aoffset3); + aoffset3 += 8; + row4 = _mm512_loadu_pd(aoffset4); + aoffset4 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + _mm512_storeu_pd(boffset1 + 16, row3); + _mm512_storeu_pd(boffset1 + 24, row4); + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + __m256d row1, row2, row3, row4; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + row3 = _mm256_loadu_pd(aoffset3); + aoffset3 += 4; + row4 = _mm256_loadu_pd(aoffset4); + aoffset4 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + _mm256_storeu_pd(boffset2 + 8, row3); + _mm256_storeu_pd(boffset2 + 12, row4); + boffset2 += 16; + } + + if (n & 2){ + __m128d row1, row2, row3, row4; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + row3 = _mm_loadu_pd(aoffset3); + aoffset3 += 2; + + row4 = _mm_loadu_pd(aoffset4); + aoffset4 += 2; + + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + _mm_storeu_pd(boffset3 + 4, row3); + _mm_storeu_pd(boffset3 + 6, row4); + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + 
*(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1, row2; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + row2 = _mm512_loadu_pd(aoffset2); + aoffset2 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + _mm512_storeu_pd(boffset1 + 8, row2); + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1, row2; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + row2 = _mm256_loadu_pd(aoffset2); + aoffset2 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + _mm256_storeu_pd(boffset2 + 4, row2); + boffset2 += 8; + } + + if (n & 2){ + __m128d row1, row2; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + row2 = _mm_loadu_pd(aoffset2); + aoffset2 += 2; + + + _mm_storeu_pd(boffset3 + 0, row1); + _mm_storeu_pd(boffset3 + 2, row2); + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + // aoffset += lda; + + boffset1 = boffset; + // boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + __m512d row1; + row1 = _mm512_loadu_pd(aoffset1); + aoffset1 += 8; + + _mm512_storeu_pd(boffset1 + 0, row1); + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + __m256d row1; + row1 = _mm256_loadu_pd(aoffset1); + aoffset1 += 4; + _mm256_storeu_pd(boffset2 + 0, row1); + // boffset2 += 4; + } + + if (n & 2){ + __m128d row1; + row1 = _mm_loadu_pd(aoffset1); + aoffset1 += 2; + + _mm_storeu_pd(boffset3 + 0, row1); + + // boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 ++; + } + } + + return 0; +} diff --git 
a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c new file mode 100644 index 000000000..498c46f0d --- /dev/null +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -0,0 +1,152 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#include + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + /* fast path.. just zero the whole matrix */ + if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + memset(c, 0, m * n * sizeof(FLOAT)); + return 0; + } + + if (n == 0 || m == 0) + return 0; + + c_offset = c; + + if (beta == ZERO){ + __m512 z_zero; + __m256 y_zero; + + z_zero = _mm512_setzero_ps(); + y_zero = _mm256_setzero_ps(); + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = m; + + while (i >= 32) { + _mm512_storeu_ps(c_offset1, z_zero); + _mm512_storeu_ps(c_offset1 + 16, z_zero); + c_offset1 += 32; + i -= 32; + } + while (i >= 8) { + _mm256_storeu_ps(c_offset1, y_zero); + c_offset1 += 8; + i -= 8; + } + + while (i > 0) { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 
3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +}; diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c new file mode 100644 index 000000000..10d3d22ed --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -0,0 +1,1177 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/* comment below left for history, data does not represent the implementation in this file */ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#include "common.h" +#include + + + +/******************************************************************************************* +* 8 lines of N +*******************************************************************************************/ + + + + + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +#define INIT64x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); 
\ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + row0d = _mm512_setzero_ps(); \ + row1d = _mm512_setzero_ps(); \ + row2d = _mm512_setzero_ps(); \ + row3d = _mm512_setzero_ps(); \ + +#define KERNEL64x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm7 = _mm512_loadu_ps(A3); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + row0d += zmm7 * zmm2; \ + row1d += zmm7 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + row2d += zmm7 * zmm2; \ + row3d += zmm7 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; \ + A3 += 16; \ + + +#define SAVE64x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0d *= zmm0; \ + row1d *= zmm0; \ + row2d *= zmm0; \ + row3d *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 
16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); \ + row0d += _mm512_loadu_ps(CO1 + 0*ldc + 48); \ + row1d += _mm512_loadu_ps(CO1 + 1*ldc + 48); \ + row2d += _mm512_loadu_ps(CO1 + 2*ldc + 48); \ + row3d += _mm512_loadu_ps(CO1 + 3*ldc + 48); \ + _mm512_storeu_ps(CO1 + 0*ldc + 48, row0d); \ + _mm512_storeu_ps(CO1 + 1*ldc + 48, row1d); \ + _mm512_storeu_ps(CO1 + 2*ldc + 48, row2d); \ + _mm512_storeu_ps(CO1 + 3*ldc + 48, row3d); + + +#define INIT48x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + +#define KERNEL48x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * 
zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; + + +#define SAVE48x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); + + +#define INIT32x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + +#define KERNEL32x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = 
_mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; + + +#define SAVE32x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); + + + +#define INIT16x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + +#define KERNEL16x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + BO += 4; \ + AO += 16; + + +#define SAVE16x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= 
zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); + + + +/*******************************************************************************************/ + +#define INIT8x4() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + ymm8 = _mm256_setzero_ps(); \ + ymm10 = _mm256_setzero_ps(); \ + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + ymm8 += ymm0 * ymm2; \ + ymm10 += ymm0 * ymm3; \ + BO += 4; \ + AO += 8; + + +#define SAVE8x4(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm8 *= ymm0; \ + ymm10 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + ymm6 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + ymm8 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + ymm10 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, ymm4); \ + _mm256_storeu_ps(CO1 + 1 * ldc, ymm6); \ + _mm256_storeu_ps(CO1 + 2 * ldc, ymm8); \ + _mm256_storeu_ps(CO1 + 3 * ldc, ymm10); \ + + + +/*******************************************************************************************/ + +#define INIT4x4() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + + +#define KERNEL4x4_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 
2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 4; + + +#define SAVE4x4(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + + +/*******************************************************************************************/ + +#define INIT2x4() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + row2 = 0; row2b = 0; row3 = 0; row3b = 0; + +#define KERNEL2x4_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3 += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + BO += 4; \ + AO += 2; + + +#define SAVE2x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + row2 *= xmm0; \ + row2b *= xmm0; \ + row3 *= xmm0; \ + row3b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + + + +/*******************************************************************************************/ + +#define INIT1x4() \ + row0 = 0; row1 = 0; row2 = 0; row3 = 0; +#define KERNEL1x4_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = 
*(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define INIT16x2() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + + +#define KERNEL16x2_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + BO += 2; \ + AO += 16; + + +#define SAVE16x2(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + row1 += _mm512_loadu_ps(CO1 + ldc); \ + _mm512_storeu_ps(CO1 , row0); \ + _mm512_storeu_ps(CO1 + ldc, row1); \ + + + + +/*******************************************************************************************/ + +#define INIT8x2() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + BO += 2; \ + AO += 8; + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + ymm6 += _mm256_loadu_ps(CO1 + ldc); \ + _mm256_storeu_ps(CO1 , ymm4); \ + _mm256_storeu_ps(CO1 + ldc, ymm6); \ + + + +/*******************************************************************************************/ + 
+#define INIT4x2() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 4; + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + row1 += _mm_loadu_ps(CO1 + ldc); \ + _mm_storeu_ps(CO1 , row0); \ + _mm_storeu_ps(CO1 + ldc, row1); \ + + + +/*******************************************************************************************/ +/* The 2x2 and 1x2 tiles below work on ordinary float scalars: the xmm* and row* names they use are plain float variables in the loops that expand them. */ + +#define INIT2x2() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + +#define KERNEL2x2_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + BO += 2; \ + AO += 2; \ + + +#define SAVE2x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + *(CO1 + ldc ) += row1; \ + *(CO1 + ldc +1) += row1b; \ + + +/*******************************************************************************************/ + +#define INIT1x2() \ + row0 = 0; row1 = 0; + +#define KERNEL1x2_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 + ldc ) += row1; \ + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ +/* Single-column tiles: every k step consumes exactly one value from BO and touches only the column at CO1, so ldc is not used by these macros. */ +#define INIT16x1() \ + row0 = _mm512_setzero_ps(); \ + +#define 
KERNEL16x1_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += zmm0 * zmm2; \ + BO += 1; \ + AO += 16; + + +#define SAVE16x1(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + _mm512_storeu_ps(CO1 , row0); \ + + +/*******************************************************************************************/ + +#define INIT8x1() \ + ymm4 = _mm256_setzero_ps(); + +#define KERNEL8x1_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + _mm256_storeu_ps(CO1 , ymm4); \ + + +/*******************************************************************************************/ + +#define INIT4x1() \ + row0 = _mm_setzero_ps(); \ + +#define KERNEL4x1_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += xmm0 * xmm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + _mm_storeu_ps(CO1 , row0); \ + + + +/*******************************************************************************************/ + +#define INIT2x1() \ + row0 = 0; row0b = 0; + +#define KERNEL2x1_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + + +/*******************************************************************************************/ + +#define INIT1x1() \ + row0 = 0; + +#define KERNEL1x1_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + BO += 1; \ + AO += 1; + + +#define SAVE1x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + *(CO1 ) += row0; \ + + 
+/*******************************************************************************************/ + + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +/* SGEMM micro-kernel: adds alpha * (A * B) into C. m, n and k are the tile extents; 0 is returned at once if any of them is 0. C is addressed as n columns of m contiguous floats, columns being ldc floats apart. A is consumed sequentially from its start for every column block: a group of R rows (R = 16, 8, 4, 2 or 1) occupies R * k consecutive floats, R per k step; the 64-row and 32-row loops walk four resp. two adjacent 16-row groups side by side. B is consumed as 4-column, then 2-column, then 1-column panels holding that many floats per k step. NOTE(review): this layout presumably matches what the GEMM copy routines produce; confirm against them. Always returns 0. */ +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +{ + unsigned long M = m, N = n, K = k; + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + + while (N >= 4) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A; +/* NOTE(review): i and kloop are int while m is BLASLONG and K is unsigned long; this assumes the tile extents fit in an int. */ + i = m; + while (i >= 64) { + float *BO; + float *A1, *A2, *A3; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, zmm5, row1, zmm7, row2, row3, row0b, row1b, row2b, row3b, row0c, row1c, row2c, row3c, row0d, row1d, row2d, row3d; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + A2 = A1 + 16 * K; + A3 = A2 + 16 * K; + + INIT64x4() + + while (kloop > 0) { + // L12_17 + KERNEL64x4_SUB() + kloop--; + } + // L8_19 + SAVE64x4(alpha) + CO1 += 64; + AO += 48 * K; + + i -= 64; + } + while (i >= 32) { + float *BO; + float *A1; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, row1, row2, row3, row0b, row1b, row2b, row3b; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + + INIT32x4() + + while (kloop > 0) { + // L12_17 + KERNEL32x4_SUB() + kloop--; + } + // L8_19 + SAVE32x4(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT16x4() + + while (kloop > 0) { + // L12_17 + KERNEL16x4_SUB() + kloop--; + } + // L8_19 + SAVE16x4(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, ymm4, ymm6,ymm8,ymm10; + BO = B; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // 
L8_19 + SAVE8x4(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b, row2, row2b, row3, row3b; + BO = B; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3; + int kloop = K; + BO = B; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1; + BO = B; + int kloop = K; + + INIT16x2() + + while (kloop > 0) { + // L12_17 + KERNEL16x2_SUB() + kloop--; + } + // L8_19 + SAVE16x2(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm3, ymm4, ymm6; + // L8_11 + BO = B; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + CO1 += 8; + + i-=8; + } + + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1; + // L8_11 + BO = B; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) + CO1 += 
4; + + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b; + int kloop = K; + BO = B; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1; + int kloop = K; + BO = B; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 2; + N -= 2; + } + + // L8_0 + while (N >= 1) { + // L8_10 + float *CO1; + float *AO; + int i; + + CO1 = C; + C += ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + __m512 zmm0, zmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT16x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL16x1_SUB() + kloop--; + } + // L8_19 + SAVE16x1(alpha) + CO1 += 16; + + i-= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm4; + // L8_11 + BO = B; + int kloop = K; + + INIT8x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x1_SUB() + kloop--; + } + // L8_19 + SAVE8x1(alpha) + CO1 += 8; + + i-= 8; + } + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT4x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x1_SUB() + kloop--; + } + // L8_19 + SAVE4x1(alpha) + CO1 += 4; + + i-= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, row0, row0b; + int kloop = K; + BO = B; + + INIT2x1() + + while (kloop > 0) { + KERNEL2x1_SUB() + kloop--; + } + SAVE2x1(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, row0; + int 
kloop = K; + + BO = B; + INIT1x1() + + + while (kloop > 0) { + KERNEL1x1_SUB() + kloop--; + } + SAVE1x1(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 1; + N -= 1; + } + + + return 0; +} diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c new file mode 100644 index 000000000..8577e3b38 --- /dev/null +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#include <immintrin.h>
+/* Packing copy: a holds n vectors of m contiguous elements, lda apart. Vectors are taken 4 at a time (then 2, then 1) */
+/* and written to b element-interleaved. Full 4x4 tiles use an SSE transpose (presumes FLOAT is float - confirm). Returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
+  BLASLONG i, j;
+
+  FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+  FLOAT *b_offset;
+  FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+  FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+  FLOAT ctemp9;
+  FLOAT ctemp13;
+
+  a_offset = a;
+  b_offset = b;
+
+  j = (n >> 2); /* number of complete groups of 4 vectors */
+  if (j > 0){
+    do{
+      a_offset1 = a_offset;
+      a_offset2 = a_offset1 + lda;
+      a_offset3 = a_offset2 + lda;
+      a_offset4 = a_offset3 + lda;
+      a_offset += 4 * lda; /* start of the next group */
+
+      i = (m >> 2); /* number of complete 4x4 tiles in this group */
+      if (i > 0){
+        do{
+          __m128 row0, row1, row2, row3;
+
+          row0 = _mm_loadu_ps(a_offset1);
+          row1 = _mm_loadu_ps(a_offset2);
+          row2 = _mm_loadu_ps(a_offset3);
+          row3 = _mm_loadu_ps(a_offset4);
+
+          _MM_TRANSPOSE4_PS(row0, row1, row2, row3); /* rowK now holds element K of each of the 4 vectors */
+
+          _mm_storeu_ps(b_offset + 0, row0);
+          _mm_storeu_ps(b_offset + 4, row1);
+          _mm_storeu_ps(b_offset + 8, row2);
+          _mm_storeu_ps(b_offset + 12, row3);
+
+          a_offset1 += 4;
+          a_offset2 += 4;
+          a_offset3 += 4;
+          a_offset4 += 4;
+
+          b_offset += 16;
+          i --;
+        }while(i > 0);
+      }
+
+      i = (m & 3); /* leftover elements: same interleaving, one element per vector at a time */
+      if (i > 0){
+        do{
+          ctemp1 = *(a_offset1 + 0);
+          ctemp5 = *(a_offset2 + 0);
+          ctemp9 = *(a_offset3 + 0);
+          ctemp13 = *(a_offset4 + 0);
+
+          *(b_offset + 0) = ctemp1;
+          *(b_offset + 1) = ctemp5;
+          *(b_offset + 2) = ctemp9;
+          *(b_offset + 3) = ctemp13;
+
+          a_offset1 ++;
+          a_offset2 ++;
+          a_offset3 ++;
+          a_offset4 ++;
+
+          b_offset += 4;
+          i --;
+        }while(i > 0);
+      }
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 2){ /* a pair of vectors remains: interleave them two-wide */
+    a_offset1 = a_offset;
+    a_offset2 = a_offset1 + lda;
+    a_offset += 2 * lda;
+
+    i = (m >> 2);
+    if (i > 0){
+      do{
+        ctemp1 = *(a_offset1 + 0);
+        ctemp2 = *(a_offset1 + 1);
+        ctemp3 = *(a_offset1 + 2);
+        ctemp4 = *(a_offset1 + 3);
+
+        ctemp5 = *(a_offset2 + 0);
+        ctemp6 = *(a_offset2 + 1);
+        ctemp7 = *(a_offset2 + 2);
+        ctemp8 = *(a_offset2 + 3);
+
+        *(b_offset + 0) = ctemp1;
+        *(b_offset + 1) = ctemp5;
+        *(b_offset + 2) = ctemp2;
+        *(b_offset + 3) = ctemp6;
+
+        *(b_offset + 4) = ctemp3;
+        *(b_offset + 5) = ctemp7;
+        *(b_offset + 6) = ctemp4;
+        *(b_offset + 7) = ctemp8;
+
+        a_offset1 += 4;
+        a_offset2 += 4;
+        b_offset += 8;
+        i --;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0){
+      do{
+        ctemp1 = *(a_offset1 + 0);
+        ctemp5 = *(a_offset2 + 0);
+
+        *(b_offset + 0) = ctemp1;
+        *(b_offset + 1) = ctemp5;
+
+        a_offset1 ++;
+        a_offset2 ++;
+        b_offset += 2;
+        i --;
+      }while(i > 0);
+    }
+  } /* end of if (n & 2) */
+
+  if (n & 1){ /* one vector remains: plain sequential copy */
+    a_offset1 = a_offset;
+
+    i = (m >> 2);
+    if (i > 0){
+      do{
+        ctemp1 = *(a_offset1 + 0);
+        ctemp2 = *(a_offset1 + 1);
+        ctemp3 = *(a_offset1 + 2);
+        ctemp4 = *(a_offset1 + 3);
+
+        *(b_offset + 0) = ctemp1;
+        *(b_offset + 1) = ctemp2;
+        *(b_offset + 2) = ctemp3;
+        *(b_offset + 3) = ctemp4;
+
+        a_offset1 += 4;
+        b_offset += 4;
+        i --;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0){
+      do{
+        ctemp1 = *(a_offset1 + 0);
+        *(b_offset + 0) = ctemp1;
+        a_offset1 ++;
+        b_offset += 1;
+        i --;
+      }while(i > 0);
+    }
+  } /* end of if (n & 1) */
+
+  return 0;
+}
diff --git a/kernel/x86_64/sgemm_tcopy_16_skylakex.c b/kernel/x86_64/sgemm_tcopy_16_skylakex.c
new file mode 100644
index 000000000..dbacc5081
--- /dev/null
+++ b/kernel/x86_64/sgemm_tcopy_16_skylakex.c
@@ -0,0 +1,387 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. 
*/
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+/* Packing copy: a is read as m rows lda apart; each row's n contiguous elements are cut into strips 16 wide, then 8/4/2/1 for the bits of n & 15. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){
+  /* For every strip, each row's slice is appended to b in row order (two rows per loop pass, then one odd row if m is odd). Returns 0. */
+  BLASLONG i, j;
+
+  FLOAT *aoffset;
+  FLOAT *aoffset1, *aoffset2;
+  FLOAT *boffset;
+
+  FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+  FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+  FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
+  FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
+  FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
+  FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
+
+  aoffset = a;
+  boffset = b;
+
+#if 0 /* disabled debug trace */
+  fprintf(stderr, "m = %d n = %d\n", m, n);
+#endif
+
+  j = (n >> 4); /* number of complete 16-wide strips */
+  if (j > 0){
+    do{
+      aoffset1 = aoffset;
+      aoffset2 = aoffset + lda;
+      aoffset += 16; /* start of the next strip */
+
+      i = (m >> 1); /* row pairs */
+      if (i > 0){
+        do{
+          ctemp01 = *(aoffset1 + 0);
+          ctemp02 = *(aoffset1 + 1);
+          ctemp03 = *(aoffset1 + 2);
+          ctemp04 = *(aoffset1 + 3);
+          ctemp05 = *(aoffset1 + 4);
+          ctemp06 = *(aoffset1 + 5);
+          ctemp07 = *(aoffset1 + 6);
+          ctemp08 = *(aoffset1 + 7);
+          ctemp09 = *(aoffset1 + 8);
+          ctemp10 = *(aoffset1 + 9);
+          ctemp11 = *(aoffset1 + 10);
+          ctemp12 = *(aoffset1 + 11);
+          ctemp13 = *(aoffset1 + 12);
+          ctemp14 = *(aoffset1 + 13);
+          ctemp15 = *(aoffset1 + 14);
+          ctemp16 = *(aoffset1 + 15);
+
+          ctemp17 = *(aoffset2 + 0);
+          ctemp18 = *(aoffset2 + 1);
+          ctemp19 = *(aoffset2 + 2);
+          ctemp20 = *(aoffset2 + 3);
+          ctemp21 = *(aoffset2 + 4);
+          ctemp22 = *(aoffset2 + 5);
+          ctemp23 = *(aoffset2 + 6);
+          ctemp24 = *(aoffset2 + 7);
+          ctemp25 = *(aoffset2 + 8);
+          ctemp26 = *(aoffset2 + 9);
+          ctemp27 = *(aoffset2 + 10);
+          ctemp28 = *(aoffset2 + 11);
+          ctemp29 = *(aoffset2 + 12);
+          ctemp30 = *(aoffset2 + 13);
+          ctemp31 = *(aoffset2 + 14);
+          ctemp32 = *(aoffset2 + 15);
+
+          *(boffset + 0) = ctemp01;
+          *(boffset + 1) = ctemp02;
+          *(boffset + 2) = ctemp03;
+          *(boffset + 3) = ctemp04;
+          *(boffset + 4) = ctemp05;
+          *(boffset + 5) = ctemp06;
+          *(boffset + 6) = ctemp07;
+          *(boffset + 7) = ctemp08;
+
+          *(boffset + 8) = ctemp09;
+          *(boffset + 9) = ctemp10;
+          *(boffset + 10) = ctemp11;
+          *(boffset + 11) = ctemp12;
+          *(boffset + 12) = ctemp13;
+          *(boffset + 13) = ctemp14;
+          *(boffset + 14) = ctemp15;
+          *(boffset + 15) = ctemp16;
+
+          *(boffset + 16) = ctemp17;
+          *(boffset + 17) = ctemp18;
+          *(boffset + 18) = ctemp19;
+          *(boffset + 19) = ctemp20;
+          *(boffset + 20) = ctemp21;
+          *(boffset + 21) = ctemp22;
+          *(boffset + 22) = ctemp23;
+          *(boffset + 23) = ctemp24;
+
+          *(boffset + 24) = ctemp25;
+          *(boffset + 25) = ctemp26;
+          *(boffset + 26) = ctemp27;
+          *(boffset + 27) = ctemp28;
+          *(boffset + 28) = ctemp29;
+          *(boffset + 29) = ctemp30;
+          *(boffset + 30) = ctemp31;
+          *(boffset + 31) = ctemp32;
+
+          aoffset1 += 2 * lda;
+          aoffset2 += 2 * lda;
+          boffset += 32;
+
+          i --;
+        }while(i > 0);
+      }
+
+      if (m & 1){ /* odd final row of this strip */
+        ctemp01 = *(aoffset1 + 0);
+        ctemp02 = *(aoffset1 + 1);
+        ctemp03 = *(aoffset1 + 2);
+        ctemp04 = *(aoffset1 + 3);
+        ctemp05 = *(aoffset1 + 4);
+        ctemp06 = *(aoffset1 + 5);
+        ctemp07 = *(aoffset1 + 6);
+        ctemp08 = *(aoffset1 + 7);
+        ctemp09 = *(aoffset1 + 8);
+        ctemp10 = *(aoffset1 + 9);
+        ctemp11 = *(aoffset1 + 10);
+        ctemp12 = *(aoffset1 + 11);
+        ctemp13 = *(aoffset1 + 12);
+        ctemp14 = *(aoffset1 + 13);
+        ctemp15 = *(aoffset1 + 14);
+        ctemp16 = *(aoffset1 + 15);
+
+        *(boffset + 0) = ctemp01;
+        *(boffset + 1) = ctemp02;
+        *(boffset + 2) = ctemp03;
+        *(boffset + 3) = ctemp04;
+        *(boffset + 4) = ctemp05;
+        *(boffset + 5) = ctemp06;
+        *(boffset + 6) = ctemp07;
+        *(boffset + 7) = ctemp08;
+
+        *(boffset + 8) = ctemp09;
+        *(boffset + 9) = ctemp10;
+        *(boffset + 10) = ctemp11;
+        *(boffset + 11) = ctemp12;
+        *(boffset + 12) = ctemp13;
+        *(boffset + 13) = ctemp14;
+        *(boffset + 14) = ctemp15;
+        *(boffset + 15) = ctemp16;
+
+        boffset += 16;
+      }
+
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 8){ /* 8-wide tail strip */
+    aoffset1 = aoffset;
+    aoffset2 = aoffset + lda;
+    aoffset += 8;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+        ctemp01 = *(aoffset1 + 0);
+        ctemp02 = *(aoffset1 + 1);
+        ctemp03 = *(aoffset1 + 2);
+        ctemp04 = *(aoffset1 + 3);
+        ctemp05 = *(aoffset1 + 4);
+        ctemp06 = *(aoffset1 + 5);
+        ctemp07 = *(aoffset1 + 6);
+        ctemp08 = *(aoffset1 + 7);
+
+        ctemp09 = *(aoffset2 + 0);
+        ctemp10 = *(aoffset2 + 1);
+        ctemp11 = *(aoffset2 + 2);
+        ctemp12 = *(aoffset2 + 3);
+        ctemp13 = *(aoffset2 + 4);
+        ctemp14 = *(aoffset2 + 5);
+        ctemp15 = *(aoffset2 + 6);
+        ctemp16 = *(aoffset2 + 7);
+
+        *(boffset + 0) = ctemp01;
+        *(boffset + 1) = ctemp02;
+        *(boffset + 2) = ctemp03;
+        *(boffset + 3) = ctemp04;
+        *(boffset + 4) = ctemp05;
+        *(boffset + 5) = ctemp06;
+        *(boffset + 6) = ctemp07;
+        *(boffset + 7) = ctemp08;
+
+        *(boffset + 8) = ctemp09;
+        *(boffset + 9) = ctemp10;
+        *(boffset + 10) = ctemp11;
+        *(boffset + 11) = ctemp12;
+        *(boffset + 12) = ctemp13;
+        *(boffset + 13) = ctemp14;
+        *(boffset + 14) = ctemp15;
+        *(boffset + 15) = ctemp16;
+
+        aoffset1 += 2 * lda;
+        aoffset2 += 2 * lda;
+        boffset += 16;
+
+        i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 + 0);
+      ctemp02 = *(aoffset1 + 1);
+      ctemp03 = *(aoffset1 + 2);
+      ctemp04 = *(aoffset1 + 3);
+      ctemp05 = *(aoffset1 + 4);
+      ctemp06 = *(aoffset1 + 5);
+      ctemp07 = *(aoffset1 + 6);
+      ctemp08 = *(aoffset1 + 7);
+
+      *(boffset + 0) = ctemp01;
+      *(boffset + 1) = ctemp02;
+      *(boffset + 2) = ctemp03;
+      *(boffset + 3) = ctemp04;
+      *(boffset + 4) = ctemp05;
+      *(boffset + 5) = ctemp06;
+      *(boffset + 6) = ctemp07;
+      *(boffset + 7) = ctemp08;
+
+      boffset += 8;
+    }
+  }
+
+  if (n & 4){ /* 4-wide tail strip */
+    aoffset1 = aoffset;
+    aoffset2 = aoffset + lda;
+    aoffset += 4;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+        ctemp01 = *(aoffset1 + 0);
+        ctemp02 = *(aoffset1 + 1);
+        ctemp03 = *(aoffset1 + 2);
+        ctemp04 = *(aoffset1 + 3);
+
+        ctemp05 = *(aoffset2 + 0);
+        ctemp06 = *(aoffset2 + 1);
+        ctemp07 = *(aoffset2 + 2);
+        ctemp08 = *(aoffset2 + 3);
+
+        *(boffset + 0) = ctemp01;
+        *(boffset + 1) = ctemp02;
+        *(boffset + 2) = ctemp03;
+        *(boffset + 3) = ctemp04;
+        *(boffset + 4) = ctemp05;
+        *(boffset + 5) = ctemp06;
+        *(boffset + 6) = ctemp07;
+        *(boffset + 7) = ctemp08;
+
+        aoffset1 += 2 * lda;
+        aoffset2 += 2 * lda;
+        boffset += 8;
+
+        i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 + 0);
+      ctemp02 = *(aoffset1 + 1);
+      ctemp03 = *(aoffset1 + 2);
+      ctemp04 = *(aoffset1 + 3);
+
+      *(boffset + 0) = ctemp01;
+      *(boffset + 1) = ctemp02;
+      *(boffset + 2) = ctemp03;
+      *(boffset + 3) = ctemp04;
+
+      boffset += 4;
+    }
+  }
+
+  if (n & 2){ /* 2-wide tail strip */
+    aoffset1 = aoffset;
+    aoffset2 = aoffset + lda;
+    aoffset += 2;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+        ctemp01 = *(aoffset1 + 0);
+        ctemp02 = *(aoffset1 + 1);
+        ctemp03 = *(aoffset2 + 0);
+        ctemp04 = *(aoffset2 + 1);
+
+        *(boffset + 0) = ctemp01;
+        *(boffset + 1) = ctemp02;
+        *(boffset + 2) = ctemp03;
+        *(boffset + 3) = ctemp04;
+
+        aoffset1 += 2 * lda;
+        aoffset2 += 2 * lda;
+        boffset += 4;
+
+        i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 + 0);
+      ctemp02 = *(aoffset1 + 1);
+
+      *(boffset + 0) = ctemp01;
+      *(boffset + 1) = ctemp02;
+      boffset += 2;
+    }
+  }
+
+  if (n & 1){ /* final single-element strip */
+    aoffset1 = aoffset;
+    aoffset2 = aoffset + lda;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+        ctemp01 = *(aoffset1 + 0);
+        ctemp02 = *(aoffset2 + 0);
+
+        *(boffset + 0) = ctemp01;
+        *(boffset + 1) = ctemp02;
+
+        aoffset1 += 2 * lda;
+        aoffset2 += 2 * lda;
+        boffset += 2;
+
+        i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 + 0);
+      *(boffset + 0) = ctemp01;
+      /* last store of the routine: boffset is deliberately not advanced */
+    }
+  }
+
+  return 0;
+}
diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h
index 1e2509bf0..8262c3488 100644
--- a/lapack-netlib/LAPACKE/include/lapacke_config.h
+++ 
b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -34,6 +34,13 @@ #ifndef _LAPACKE_CONFIG_H_ #define _LAPACKE_CONFIG_H_ +// For Android prior to API 21 (no include) +#if defined(__ANDROID__) +#if __ANDROID_API__ < 21 +#define LAPACK_COMPLEX_STRUCTURE +#endif +#endif + #ifdef __cplusplus #if defined(LAPACK_COMPLEX_CPP) #include diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 44884d4a5..7672f9f73 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2454,6 +2454,8 @@ endif all: ../../$(LAPACKELIB) +.PHONY: ../../$(LAPACKELIB) + ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c index 2cc7b9ad2..dbd6e9049 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_dsytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); double* a_t = NULL; double* tb_t = NULL; /* Check leading dimension(s) */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c index 5b8010d9e..b9ba0fb56 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); lapack_complex_double* a_t = NULL; lapack_complex_double* tb_t = NULL; /* Check leading dimension(s) */ diff --git 
a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c index f91c42257..db27e2873 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_2stage_work.c @@ -50,7 +50,6 @@ lapack_int LAPACKE_zsytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,n); - lapack_int ldb_t = MAX(1,n); lapack_complex_double* a_t = NULL; lapack_complex_double* tb_t = NULL; /* Check leading dimension(s) */ diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 531cb51fc..87a8f51e4 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -552,6 +552,8 @@ endif all: ../$(LAPACKLIB) +.PHONY: ../$(LAPACKLIB) + ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 29380d4dc..84a882d2e 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -407,7 +407,7 @@ * Matrix all zero. Return zero solution. * CALL SLASET( 'F', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - CALL SLASET( 'F', MINMN, 1, ZERO, ZERO, S, 1 ) + CALL SLASET( 'F', MINMN, 1, ZERO, ZERO, S, MINMN ) RANK = 0 GO TO 70 END IF diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e20004c2f..a1d784fa5 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -57,6 +57,8 @@ all: ../../$(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) +.PHONY: ../../$(TMGLIB) + ../../$(TMGLIB): $(ALLOBJ) $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ diff --git a/param.h b/param.h index cfa4bba5c..8f56cdaaa 100644 --- a/param.h +++ b/param.h @@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 #define SWITCH_RATIO 32 +#define GEMM_PREFERED_SIZE 32 #ifdef ARCH_X86 @@ -2542,8 +2543,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif +// Common ARMv8 parameters +#if defined(ARMV8) -#if defined(CORTEXA57) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2551,6 +2553,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#define SYMV_P 16 + +// Darwin / Cross +#if defined(OS_DARWIN) && defined(CROSS) + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#else // Linux / Native + +#if defined(CORTEXA53) || defined(CORTEXA57) || \ + defined(CORTEXA72) || defined(CORTEXA73) || \ + defined(FALKOR) + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2578,17 +2618,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 - -#define SYMV_P 16 -#endif - -#if defined(ARMV8) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2617,56 +2647,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 - -#define SYMV_P 16 -#endif - -#if defined(THUNDERX) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 - -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 - -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 - -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 - -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 96 -#define ZGEMM_DEFAULT_P 64 - -#define SGEMM_DEFAULT_Q 240 -#define DGEMM_DEFAULT_Q 120 -#define CGEMM_DEFAULT_Q 120 -#define ZGEMM_DEFAULT_Q 120 - -#define SGEMM_DEFAULT_R 12288 -#define DGEMM_DEFAULT_R 8192 -#define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 - - -#define SYMV_P 16 -#endif - -#if defined(THUNDERX2T99) || defined(VULCAN) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#elif defined(THUNDERX2T99) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2680,23 +2661,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p -#define CGEMM_DEFAULT_P cgemm_p -#define ZGEMM_DEFAULT_P zgemm_p +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q sgemm_q -#define DGEMM_DEFAULT_Q dgemm_q -#define CGEMM_DEFAULT_Q cgemm_q -#define ZGEMM_DEFAULT_Q zgemm_q +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 -#define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R cgemm_r -#define ZGEMM_DEFAULT_R zgemm_r +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 -#define SYMV_P 16 -#endif +#else // Other/undetected ARMv8 cores + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#endif // Cores + +#endif // Linux / Darwin + +#endif // ARMv8 #if defined(ARMV5) #define SNUMOPT 2 diff --git a/utest/test_fork.c b/utest/test_fork.c index 9fc51287c..0b90407b1 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ -#include "openblas_utest.h" #include #include #include +#include "openblas_utest.h" void* xmalloc(size_t n) {