Merge branch 'develop'
This commit is contained in:
commit
85636ff1a0
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
|
|||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 18)
|
||||
set(OpenBLAS_PATCH_VERSION 19)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
|
|
@ -45,8 +45,8 @@ endif()
|
|||
|
||||
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||
|
||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||
|
||||
|
|
@ -123,9 +123,9 @@ endforeach ()
|
|||
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
|
||||
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
|
||||
if (NOT NOFORTRAN AND NOT NO_LAPACK)
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
|
||||
if (NOT NO_LAPACKE)
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
|
@ -137,7 +137,7 @@ endif()
|
|||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
||||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
|
|
|
|||
|
|
@ -150,3 +150,14 @@ In chronological order:
|
|||
* theoractice <https://github.com/theoractice/>
|
||||
* [2016-03-20] Fix compiler error in VisualStudio with CMake
|
||||
* [2016-03-22] Fix access violation on Windows while static linking
|
||||
|
||||
* Paul Mustière <https://github.com/buffer51/>
|
||||
* [2016-02-04] Fix Android build on ARMV7
|
||||
* [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
|
||||
|
||||
* Shivraj Patil <https://github.com/sva-img/>
|
||||
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
|
||||
|
||||
* Kaustubh Raste <https://github.com/ksraste/>
|
||||
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
|
|
|
|||
|
|
@ -1,4 +1,22 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.19
|
||||
1-Sep-2016
|
||||
common:
|
||||
* Improved cross compiling.
|
||||
* Fixed a bug on musl libc.
|
||||
|
||||
POWER:
|
||||
* Optimize BLAS on Power8
|
||||
* Fixed Julia+OpenBLAS bugs on Power8
|
||||
|
||||
MIPS:
|
||||
* Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste)
|
||||
|
||||
ARM:
|
||||
* Improved on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
|
||||
====================================================================
|
||||
Version 0.2.18
|
||||
12-Apr-2016
|
||||
|
|
|
|||
4
Makefile
4
Makefile
|
|
@ -108,8 +108,6 @@ endif
|
|||
|
||||
tests :
|
||||
ifndef NOFORTRAN
|
||||
ifndef TARGET
|
||||
ifndef CROSS
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
|
|
@ -119,8 +117,6 @@ ifndef NO_CBLAS
|
|||
$(MAKE) -C ctest all
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
libs :
|
||||
ifeq ($(CORE), UNKOWN)
|
||||
|
|
|
|||
|
|
@ -20,75 +20,75 @@ lib.grd :
|
|||
$(error OpenBLAS: Please run "make" firstly)
|
||||
|
||||
install : lib.grd
|
||||
@-mkdir -p $(DESTDIR)$(PREFIX)
|
||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
|
||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||
@-mkdir -p "$(DESTDIR)$(PREFIX)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
|
||||
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
#for inc
|
||||
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
||||
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
@echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
@cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||
|
||||
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
||||
@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
||||
@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
||||
@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
||||
@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
||||
@echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||
@echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||
@echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||
@cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||
@echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||
|
||||
ifndef NO_CBLAS
|
||||
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
|
||||
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
endif
|
||||
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
|
||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
|
||||
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
# Honor DESTDIR for staged installs and quote the path, matching the WINNT branch.
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
|
|
@ -96,34 +96,34 @@ endif
|
|||
endif
|
||||
#Generating OpenBLASConfig.cmake
|
||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
||||
ifndef NO_SHARED
|
||||
# ifeq combined with $(filter ...) acts as a logical OR over the listed OS names
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
else
|
||||
#only static
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
#Generating OpenBLASConfigVersion.cmake
|
||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
||||
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo Install OK!
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,3 @@
|
|||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
|
|
@ -1,4 +1,26 @@
|
|||
# CCOMMON_OPT += -DALLOC_SHM
|
||||
|
||||
ifdef USE_THREAD
|
||||
ifeq ($(USE_THREAD), 0)
|
||||
USE_OPENMP = 0
|
||||
else
|
||||
USE_OPENMP = 1
|
||||
endif
|
||||
else
|
||||
USE_OPENMP = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
FLAMEPATH = $(HOME)/flame/lib
|
||||
|
||||
|
|
@ -16,6 +38,16 @@ else
|
|||
endif
|
||||
endif
|
||||
|
||||
# Either uncomment the line below or run make with `USE_MASS=1` to enable support for the MASS library
|
||||
#USE_MASS = 1
|
||||
|
||||
ifeq ($(USE_MASS), 1)
|
||||
# Path to MASS libs, change it if the libs are installed at any other location
|
||||
MASSPATH = /opt/ibm/xlmass/8.1.3/lib
|
||||
COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
|
||||
EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
|
||||
endif
|
||||
|
||||
ifdef BINARY64
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -17,14 +17,26 @@ ifdef CPUIDEMU
|
|||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||
endif
|
||||
|
||||
# For TARGET=P5600, set the ISA flag that is appended to the c_check and
# f_check command lines below via $(TARGET_FLAGS).
ifeq ($(TARGET), P5600)
|
||||
TARGET_FLAGS = -mips32r5
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), I6400)
|
||||
TARGET_FLAGS = -mips64r6
|
||||
endif
|
||||
|
||||
# For TARGET=P6600, set the ISA flag that is appended to the c_check and
# f_check command lines below via $(TARGET_FLAGS).
ifeq ($(TARGET), P6600)
|
||||
TARGET_FLAGS = -mips64r6
|
||||
endif
|
||||
|
||||
all: getarch_2nd
|
||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
||||
config.h : c_check f_check getarch
|
||||
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC)
|
||||
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC)
|
||||
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
|
||||
else
|
||||
#When we only build CBLAS, we set NOFORTRAN=2
|
||||
echo "NOFORTRAN=2" >> $(TARGET_MAKE)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.18
|
||||
VERSION = 0.2.19
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
@ -52,6 +52,7 @@ VERSION = 0.2.18
|
|||
# USE_THREAD = 0
|
||||
|
||||
# If you're going to use this library with OpenMP, please uncomment it.
|
||||
# This flag is always set for POWER8. Don't modify the flag
|
||||
# USE_OPENMP = 1
|
||||
|
||||
# You can define maximum number of threads. Basically it should be
|
||||
|
|
@ -153,10 +154,12 @@ NO_AFFINITY = 1
|
|||
|
||||
# Common Optimization Flag;
|
||||
# The default -O2 is enough.
|
||||
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
|
||||
# COMMON_OPT = -O2
|
||||
|
||||
# gfortran option for LAPACK
|
||||
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
|
||||
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
|
||||
# FCOMMON_OPT = -frecursive
|
||||
|
||||
# Profiling flags
|
||||
|
|
|
|||
|
|
@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE
|
|||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
||||
# Generating Makefile.conf and config.h
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all)
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
|
||||
ifndef TARGET_CORE
|
||||
include $(TOPDIR)/Makefile.conf
|
||||
|
|
@ -462,7 +462,7 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
||||
NO_BINARY_MODE = 1
|
||||
endif
|
||||
|
||||
|
|
@ -502,13 +502,16 @@ endif
|
|||
|
||||
ifdef NO_BINARY_MODE
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||
ifdef BINARY64
|
||||
CCOMMON_OPT += -mabi=64
|
||||
else
|
||||
CCOMMON_OPT += -mabi=n32
|
||||
endif
|
||||
BINARY_DEFINED = 1
|
||||
else ifeq ($(ARCH), $(filter $(ARCH),mips))
|
||||
CCOMMON_OPT += -mabi=32
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
|
|
@ -521,6 +524,21 @@ CCOMMON_OPT += -march=mips64
|
|||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), P5600)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), I6400)
|
||||
CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), P6600)
|
||||
CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
|
@ -589,12 +607,14 @@ ifneq ($(NO_LAPACK), 1)
|
|||
EXTRALIB += -lgfortran
|
||||
endif
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||
ifdef BINARY64
|
||||
FCOMMON_OPT += -mabi=64
|
||||
else
|
||||
FCOMMON_OPT += -mabi=n32
|
||||
endif
|
||||
else ifeq ($(ARCH), $(filter $(ARCH),mips))
|
||||
FCOMMON_OPT += -mabi=32
|
||||
endif
|
||||
else
|
||||
ifdef BINARY64
|
||||
|
|
@ -678,20 +698,6 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(ARCH), mips64)
|
||||
ifndef BINARY64
|
||||
FCOMMON_OPT += -m32
|
||||
else
|
||||
FCOMMON_OPT += -m64
|
||||
endif
|
||||
else
|
||||
ifdef BINARY64
|
||||
FCOMMON_OPT += -mabi=64
|
||||
else
|
||||
FCOMMON_OPT += -mabi=n32
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -mp
|
||||
endif
|
||||
|
|
@ -707,7 +713,7 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
||||
ifndef BINARY64
|
||||
FCOMMON_OPT += -n32
|
||||
else
|
||||
|
|
@ -737,7 +743,7 @@ endif
|
|||
|
||||
ifeq ($(C_COMPILER), OPEN64)
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
||||
ifndef BINARY64
|
||||
CCOMMON_OPT += -n32
|
||||
else
|
||||
|
|
@ -1126,6 +1132,8 @@ export HAVE_VFP
|
|||
export HAVE_VFPV3
|
||||
export HAVE_VFPV4
|
||||
export HAVE_NEON
|
||||
export HAVE_MSA
|
||||
export MSA_FLAGS
|
||||
export KERNELDIR
|
||||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
|
|
|
|||
30
README.md
30
README.md
|
|
@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6
|
|||
|
||||
make DEBUG=1
|
||||
|
||||
### Compile with MASS Support on Power CPU (Optional dependency)
|
||||
|
||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
|
||||
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as follows:
|
||||
|
||||
* On Ubuntu:
|
||||
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install libxlmass-devel.8.1.3
|
||||
|
||||
* On RHEL/CentOS:
|
||||
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||
sudo rpm --import repomd.xml.key
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||
sudo yum install libxlmass-devel.8.1.3
|
||||
|
||||
After installing the MASS library, compile OpenBLAS with USE_MASS=1.
|
||||
|
||||
Example:
|
||||
|
||||
Compiling on Power8 with MASS support -
|
||||
|
||||
make USE_MASS=1 TARGET=POWER8
|
||||
|
||||
### Install to the directory (optional)
|
||||
|
||||
Example:
|
||||
|
|
@ -82,6 +111,7 @@ Please read GotoBLAS_01Readme.txt
|
|||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
|
||||
## Usages
|
||||
Link with libopenblas.a or -lopenblas for shared library.
|
||||
|
|
|
|||
|
|
@ -53,26 +53,31 @@ PPC440
|
|||
PPC440FP2
|
||||
CELL
|
||||
|
||||
3.MIPS64 CPU:
|
||||
3.MIPS CPU:
|
||||
P5600
|
||||
|
||||
4.MIPS64 CPU:
|
||||
SICORTEX
|
||||
LOONGSON3A
|
||||
LOONGSON3B
|
||||
I6400
|
||||
P6600
|
||||
|
||||
4.IA64 CPU:
|
||||
5.IA64 CPU:
|
||||
ITANIUM2
|
||||
|
||||
5.SPARC CPU:
|
||||
6.SPARC CPU:
|
||||
SPARC
|
||||
SPARCV7
|
||||
|
||||
6.ARM CPU:
|
||||
7.ARM CPU:
|
||||
CORTEXA15
|
||||
CORTEXA9
|
||||
ARMV7
|
||||
ARMV6
|
||||
ARMV5
|
||||
|
||||
7.ARM 64-bit CPU:
|
||||
8.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
CORTEXA57
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
version: 0.2.18.{build}
|
||||
version: 0.2.19.{build}
|
||||
|
||||
#environment:
|
||||
|
||||
|
|
|
|||
|
|
@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
smallscaling \
|
||||
isamax.goto idamax.goto icamax.goto izamax.goto \
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
|
@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
|||
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
|
||||
sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
|
||||
spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
|
||||
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
|
||||
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
|
||||
isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
|
||||
snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas
# The nrm2 prerequisites above use the .atlas suffix so that the atlas target builds the ATLAS-linked benchmarks rather than the .goto ones.
|
||||
|
||||
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
|
||||
|
|
@ -261,7 +265,9 @@ endif
|
|||
|
||||
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
|
||||
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
|
||||
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
|
||||
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \
|
||||
scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \
|
||||
strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl
|
||||
|
||||
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
|
||||
|
|
@ -393,6 +399,9 @@ scholesky.mkl : scholesky.$(SUFFIX)
|
|||
scholesky.veclib : scholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
scholesky.essl : scholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dcholesky ###################################################
|
||||
|
||||
dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -410,6 +419,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX)
|
|||
dcholesky.veclib : dcholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dcholesky.essl : dcholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ccholesky ###################################################
|
||||
|
||||
ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -427,6 +439,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX)
|
|||
ccholesky.veclib : ccholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ccholesky.essl : ccholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
##################################### Zcholesky ###################################################
|
||||
|
||||
|
|
@ -445,6 +460,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX)
|
|||
zcholesky.veclib : zcholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zcholesky.essl : zcholesky.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Sgemm ####################################################
|
||||
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -683,6 +701,9 @@ strsm.mkl : strsm.$(SUFFIX)
|
|||
strsm.veclib : strsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
strsm.essl : strsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dtrsm ####################################################
|
||||
dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -699,6 +720,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX)
|
|||
dtrsm.veclib : dtrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dtrsm.essl : dtrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ctrsm ####################################################
|
||||
|
||||
ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -716,6 +740,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX)
|
|||
ctrsm.veclib : ctrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ctrsm.essl : ctrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ztrsm ####################################################
|
||||
|
||||
ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -733,6 +760,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX)
|
|||
ztrsm.veclib : ztrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ztrsm.essl : ztrsm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ssyrk ####################################################
|
||||
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -1911,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
|
|||
zgemm3m.veclib : zgemm3m.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## ISAMAX ##############################################
|
||||
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
isamax.atlas : isamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## IDAMAX ##############################################
|
||||
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
idamax.atlas : idamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## ICAMAX ##############################################
|
||||
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
icamax.atlas : icamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## IZAMAX ##############################################
|
||||
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
izamax.atlas : izamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## SNRM2 ##############################################
|
||||
snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
snrm2.atlas : snrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## DNRM2 ##############################################
|
||||
dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dnrm2.atlas : dnrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## SCNRM2 ##############################################
|
||||
scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
scnrm2.atlas : scnrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## DZNRM2 ##############################################
|
||||
dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dznrm2.atlas : dznrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
###################################################################################################
|
||||
|
||||
slinpack.$(SUFFIX) : linpack.c
|
||||
|
|
@ -2217,11 +2304,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
|||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
isamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
idamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
icamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
izamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
snrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
dnrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
scnrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
dznrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
|
||||
|
|
|
|||
|
|
@ -183,9 +183,9 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
|
||||
#else
|
||||
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MBytes\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MBytes %10.6f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -221,7 +221,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -258,7 +258,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,190 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef IAMAX
|
||||
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define IAMAX BLASFUNC(izamax)
|
||||
#else
|
||||
#define IAMAX BLASFUNC(icamax)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define IAMAX BLASFUNC(idamax)
|
||||
#else
|
||||
#define IAMAX BLASFUNC(isamax)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocation helper backed by a System V shared-memory segment requested
 * with the SHM_HUGETLB flag.
 *
 * size  number of bytes wanted; the request passed to shmget() is
 *       (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 *
 * Returns the address at which the segment was attached. On any failure a
 * message is printed to stdout and the process exits with status 1, so the
 * function never returns NULL.
 */
static void *huge_malloc(BLASLONG size){

  int shmid;
  void *address;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  shmid = shmget(IPC_PRIVATE,
                 (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                 SHM_HUGETLB | IPC_CREAT | 0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);

  /* shmat() signals failure with an all-ones pointer value. */
  if ((BLASLONG)address == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Issue IPC_RMID right away; the attached mapping is what the caller uses. */
  shmctl(shmid, IPC_RMID, 0);

  return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Time\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
IAMAX (&m, x, &inc_x);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.6f secs\n", timeg);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -0,0 +1,190 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef NRM2
|
||||
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define NRM2 BLASFUNC(dznrm2)
|
||||
#else
|
||||
#define NRM2 BLASFUNC(scnrm2)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define NRM2 BLASFUNC(dnrm2)
|
||||
#else
|
||||
#define NRM2 BLASFUNC(snrm2)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocation helper backed by a System V shared-memory segment requested
 * with the SHM_HUGETLB flag.
 *
 * size  number of bytes wanted; the request passed to shmget() is
 *       (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 *
 * Returns the address at which the segment was attached. On any failure a
 * message is printed to stdout and the process exits with status 1, so the
 * function never returns NULL.
 */
static void *huge_malloc(BLASLONG size){

  int shmid;
  void *address;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  shmid = shmget(IPC_PRIVATE,
                 (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                 SHM_HUGETLB | IPC_CREAT | 0600);

  if (shmid < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  address = shmat(shmid, NULL, SHM_RND);

  /* shmat() signals failure with an all-ones pointer value. */
  if ((BLASLONG)address == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Issue IPC_RMID right away; the attached mapping is what the caller uses. */
  shmctl(shmid, IPC_RMID, 0);

  return address;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Time\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
NRM2 (&m, x, &inc_x);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.6f secs\n", timeg);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -186,8 +186,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -189,9 +189,9 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
|
||||
#else
|
||||
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include <time.h>
|
||||
#include <cblas.h>
|
||||
#include <omp.h>
|
||||
#include <pthread.h>
|
||||
#define MIN_SIZE 5
|
||||
#define MAX_SIZE 60
|
||||
#define NB_SIZE 10
|
||||
|
|
|
|||
|
|
@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MBytes\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MBytes %10.6f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
|
|||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
52
c_check
52
c_check
|
|
@ -1,5 +1,8 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use File::Basename;
|
||||
use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||
|
|
@ -8,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/);
|
|||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
|
|
@ -26,14 +30,12 @@ if ($?) {
|
|||
|
||||
$cross_suffix = "";
|
||||
|
||||
if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
|
||||
if ($1 =~ /(.*-)(.*)/) {
|
||||
$cross_suffix = $1;
|
||||
}
|
||||
} else {
|
||||
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
|
||||
$cross_suffix = $1;
|
||||
}
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /(.*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
|
||||
$compiler = "";
|
||||
|
|
@ -63,7 +65,7 @@ $os = Android if ($data =~ /OS_ANDROID/);
|
|||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
|
|
@ -79,7 +81,12 @@ if ($os eq "AIX") {
|
|||
$defined = 1;
|
||||
}
|
||||
|
||||
if (($architecture eq "mips32") || ($architecture eq "mips64")) {
|
||||
if ($architecture eq "mips") {
|
||||
$compiler_name .= " -mabi=32";
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "mips64") {
|
||||
$compiler_name .= " -mabi=n32" if ($binary eq "32");
|
||||
$compiler_name .= " -mabi=64" if ($binary eq "64");
|
||||
$defined = 1;
|
||||
|
|
@ -152,10 +159,28 @@ if ($?) {
|
|||
die 1;
|
||||
}
|
||||
|
||||
$have_msa = 0;
|
||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
|
|
@ -243,9 +268,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
|
|||
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
|
||||
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
|
||||
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
|
||||
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
|
||||
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
|
||||
print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
|
|
@ -257,6 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n";
|
|||
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
|
||||
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
||||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ endif()
|
|||
add_custom_command(
|
||||
TARGET ${OpenBLAS_LIBNAME} PRE_LINK
|
||||
COMMAND perl
|
||||
ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
|
||||
ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
|
||||
COMMENT "Create openblas.def file"
|
||||
VERBATIM)
|
||||
|
||||
|
|
|
|||
|
|
@ -50,20 +50,20 @@ else()
|
|||
set(TARGET_CONF "config.h")
|
||||
endif ()
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
||||
endif ()
|
||||
|
||||
# compile getarch
|
||||
set(GETARCH_SRC
|
||||
${CMAKE_SOURCE_DIR}/getarch.c
|
||||
${PROJECT_SOURCE_DIR}/getarch.c
|
||||
${CPUIDEMO}
|
||||
)
|
||||
|
||||
if (NOT MSVC)
|
||||
list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S)
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
endif ()
|
||||
|
||||
if (MSVC)
|
||||
|
|
@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
|||
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
|
@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
|||
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
## Description: Ported from OpenBLAS/Makefile.system
|
||||
##
|
||||
|
||||
set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib")
|
||||
set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib")
|
||||
|
||||
# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa
|
||||
# http://stackoverflow.com/questions/714100/os-detecting-makefile
|
||||
|
|
@ -78,7 +78,7 @@ else ()
|
|||
set(ONLY_CBLAS 0)
|
||||
endif ()
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
|
||||
if (NOT DEFINED NUM_THREADS)
|
||||
set(NUM_THREADS ${NUM_CORES})
|
||||
|
|
@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy")
|
|||
set(OBJCONV "${CROSS_SUFFIX}objconv")
|
||||
|
||||
# OS dependent settings
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/os.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/os.cmake")
|
||||
|
||||
# Architecture dependent settings
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
|
||||
|
||||
# C Compiler dependent settings
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
# Fortran Compiler dependent settings
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
endif ()
|
||||
|
||||
if (BINARY64)
|
||||
|
|
@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX)
|
|||
set(SYMBOLSUFFIX "")
|
||||
endif ()
|
||||
|
||||
set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}")
|
||||
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
||||
|
||||
# TODO: need to convert these Makefiles
|
||||
# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake
|
||||
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
||||
|
||||
if (${CORE} STREQUAL "PPC440")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC")
|
||||
|
|
@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def")
|
|||
set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp")
|
||||
set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip")
|
||||
|
||||
set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}")
|
||||
set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}")
|
||||
set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}")
|
||||
set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}")
|
||||
|
||||
|
||||
set(LIB_COMPONENTS BLAS)
|
||||
|
|
|
|||
16
common.h
16
common.h
|
|
@ -332,6 +332,13 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef POWER8
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
|
|
@ -397,6 +404,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_sparc.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_MIPS
|
||||
#include "common_mips.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_MIPS64
|
||||
#include "common_mips64.h"
|
||||
#endif
|
||||
|
|
@ -615,9 +626,14 @@ void gotoblas_profile_init(void);
|
|||
void gotoblas_profile_quit(void);
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
#ifndef C_MSVC
|
||||
int omp_in_parallel(void);
|
||||
int omp_get_num_procs(void);
|
||||
#else
|
||||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||
#endif
|
||||
#else
|
||||
#ifdef __ELF__
|
||||
int omp_in_parallel (void) __attribute__ ((weak));
|
||||
int omp_get_num_procs(void) __attribute__ ((weak));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,109 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_MIPS
|
||||
#define COMMON_MIPS
|
||||
|
||||
#define MB
|
||||
#define WMB
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/*
 * Lock-acquire hook for this port.
 * The body performs no operation: the lock word behind `address` is neither
 * read nor written.
 * NOTE(review): confirm that a no-op lock is acceptable for threaded builds.
 */
static void INLINE blas_lock(volatile unsigned long *address){
  (void)address;  /* unused; evaluating the pointer does not touch the pointee */
}
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
/*
 * Read hardware register $30 with the RDHWR instruction and return the value
 * narrowed to unsigned int.
 * NOTE(review): presumably callers use this as a cycle/time counter — confirm
 * what hardware register $30 provides on the targeted cores.
 */
static inline unsigned int rpcc(void){
  unsigned long counter;

  /* push/pop keeps any assembler option changes local to this statement. */
  __asm__ __volatile__(".set push \n"
                       "rdhwr %0, $30 \n"
                       ".set pop" : "=r"(counter) : : "memory");

  return (unsigned int)counter;
}
|
||||
#define RPCC_DEFINED
|
||||
|
||||
/*
 * Plain integer division helper: returns x / y converted to int.
 * No zero check is performed on y.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  blasint quotient = x / y;
  return quotient;
}
|
||||
|
||||
#define GET_IMAGE(res)
|
||||
|
||||
#define GET_IMAGE_CANCEL
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/*
 * Entry/exit boilerplate for assembly sources.
 * PROLOGUE exports REALNAME as a global symbol, opens the function for the
 * assembler and places the entry label. EPILOGUE and PROFCODE expand to
 * nothing.
 * No instruction-set selector such as ARM's ".arm" is emitted: that
 * directive is specific to ARM targets and is not accepted by a MIPS
 * assembler.
 */
#define PROLOGUE \
	.global	REALNAME ;\
	.func	REALNAME  ;\
REALNAME:

#define EPILOGUE

#define PROFCODE

#endif
|
||||
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#ifndef PAGESIZE
|
||||
#define PAGESIZE ( 4 << 10)
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
|
|||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
|
||||
// unsigned long long tmp;
|
||||
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
|
||||
//ret=tmp;
|
||||
|
|
@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){
|
|||
"rdhwr %0, $2\n"
|
||||
".set pop": "=r"(ret):: "memory");
|
||||
|
||||
#else
|
||||
__asm__ __volatile__(".set push \n"
|
||||
".set mips32r2\n"
|
||||
"rdhwr %0, $30 \n"
|
||||
".set pop" : "=r"(ret) : : "memory");
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
#define RPCC_DEFINED
|
||||
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
#ifndef NO_AFFINITY
|
||||
#define WHEREAMI
|
||||
static inline int WhereAmI(void){
|
||||
|
|
@ -134,7 +127,6 @@ static inline int WhereAmI(void){
|
|||
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
|
|
|
|||
|
|
@ -39,8 +39,13 @@
|
|||
#ifndef COMMON_POWER
|
||||
#define COMMON_POWER
|
||||
|
||||
#if defined(POWER8)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#else
|
||||
#define MB __asm__ __volatile__ ("sync")
|
||||
#define WMB __asm__ __volatile__ ("sync")
|
||||
#endif
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
|
|
@ -798,7 +803,7 @@ Lmcount$lazy_ptr:
|
|||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
|
|
|||
60
cpuid_mips.c
60
cpuid_mips.c
|
|
@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*********************************************************************/
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
#define CPU_P5600 1
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B"
|
||||
"P5600"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
|
@ -120,7 +116,7 @@ int detect(void){
|
|||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
return CPU_SICORTEX;
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
|
|
@ -149,64 +145,40 @@ char *get_corename(void){
|
|||
}
|
||||
|
||||
void get_architecture(void){
|
||||
printf("MIPS64");
|
||||
printf("MIPS");
|
||||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("LOONGSON3A");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("LOONGSON3B");
|
||||
if(detect()==CPU_P5600){
|
||||
printf("P5600");
|
||||
}else{
|
||||
printf("SICORTEX");
|
||||
printf("UNKNOWN");
|
||||
}
|
||||
}
|
||||
|
||||
void get_subdirname(void){
|
||||
printf("mips64");
|
||||
printf("mips");
|
||||
}
|
||||
|
||||
void get_cpuconfig(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("#define LOONGSON3A\n");
|
||||
if(detect()==CPU_P5600){
|
||||
printf("#define P5600\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("#define LOONGSON3B\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else{
|
||||
printf("#define SICORTEX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 32\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("loongson3a\n");
|
||||
}else if(detect()==CPU_LOONGSON3B) {
|
||||
printf("loongson3b\n");
|
||||
if(detect()==CPU_P5600) {
|
||||
printf("p5600\n");
|
||||
}else{
|
||||
#ifdef __mips64
|
||||
printf("mips64\n");
|
||||
#else
|
||||
printf("mips32\n");
|
||||
#endif
|
||||
printf("mips\n");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,238 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
#define CPU_I6400 4
|
||||
#define CPU_P6600 5
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B",
|
||||
"I6400",
|
||||
"P6600"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
||||
#ifdef linux
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("cpu", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("system type", buffer, 11)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
return CPU_SICORTEX;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
||||
char *get_corename(void){
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Write the architecture name to stdout, without a trailing newline. */
void get_architecture(void){
  fputs("MIPS64", stdout);
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("LOONGSON3A");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("LOONGSON3B");
|
||||
}else if(detect()==CPU_I6400){
|
||||
printf("I6400");
|
||||
}else if(detect()==CPU_P6600){
|
||||
printf("P6600");
|
||||
}else{
|
||||
printf("SICORTEX");
|
||||
}
|
||||
}
|
||||
|
||||
/* Write the kernel sub-directory name to stdout, without a trailing newline. */
void get_subdirname(void){
  fputs("mips64", stdout);
}
|
||||
|
||||
void get_cpuconfig(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("#define LOONGSON3A\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("#define LOONGSON3B\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_I6400){
|
||||
printf("#define I6400\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}else if(detect()==CPU_P6600){
|
||||
printf("#define P6600\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}else{
|
||||
printf("#define SICORTEX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 32\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("loongson3a\n");
|
||||
}else if(detect()==CPU_LOONGSON3B) {
|
||||
printf("loongson3b\n");
|
||||
}else if(detect()==CPU_I6400) {
|
||||
printf("i6400\n");
|
||||
}else if(detect()==CPU_P6600) {
|
||||
printf("p6600\n");
|
||||
}else{
|
||||
printf("mips64\n");
|
||||
}
|
||||
}
|
||||
|
|
@ -1172,6 +1172,8 @@ int get_cpuname(void){
|
|||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 12:
|
||||
// Braswell
|
||||
case 13:
|
||||
// Avoton
|
||||
return CPUTYPE_NEHALEM;
|
||||
|
|
@ -1678,6 +1680,8 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 12:
|
||||
// Braswell
|
||||
case 13:
|
||||
// Avoton
|
||||
return CORE_NEHALEM;
|
||||
|
|
|
|||
2
ctest.c
2
ctest.c
|
|
@ -110,7 +110,7 @@ ARCH_MIPS64
|
|||
#endif
|
||||
|
||||
#if defined(__mips32) || defined(__mips)
|
||||
ARCH_MIPS32
|
||||
ARCH_MIPS
|
||||
#endif
|
||||
|
||||
#ifdef __alpha
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
enable_language(Fortran)
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
|
|||
all :: all1 all2 all3
|
||||
|
||||
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
|
||||
ifndef CROSS
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
OMP_NUM_THREADS=2 ./xscblat1
|
||||
OMP_NUM_THREADS=2 ./xdcblat1
|
||||
|
|
@ -53,8 +54,10 @@ else
|
|||
OPENBLAS_NUM_THREADS=2 ./xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 ./xzcblat1
|
||||
endif
|
||||
endif
|
||||
|
||||
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
|
||||
ifndef CROSS
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
OMP_NUM_THREADS=2 ./xscblat2 < sin2
|
||||
OMP_NUM_THREADS=2 ./xdcblat2 < din2
|
||||
|
|
@ -66,8 +69,10 @@ else
|
|||
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
|
||||
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
|
||||
endif
|
||||
endif
|
||||
|
||||
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
|
||||
ifndef CROSS
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
OMP_NUM_THREADS=2 ./xscblat3 < sin3
|
||||
OMP_NUM_THREADS=2 ./xdcblat3 < din3
|
||||
|
|
@ -88,6 +93,7 @@ else
|
|||
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
||||
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
# sources that need to be compiled twice, once with no flags and once with LOWER
|
||||
set(UL_SOURCES
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
if (${CORE} STREQUAL "PPC440")
|
||||
set(MEMORY memory_qalloc.c)
|
||||
|
|
|
|||
|
|
@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Avoton
|
||||
if (model == 13) {
|
||||
//Intel Braswell / Avoton
|
||||
if (model == 12 || model == 13) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
|
|
@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 21; i++)
|
||||
for ( i=1 ; i <= 22; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
|
|
|
|||
|
|
@ -361,6 +361,9 @@ static void numa_mapping(void) {
|
|||
unsigned long work, bit;
|
||||
int count = 0;
|
||||
int bitmask_idx = 0;
|
||||
int current_cpu;
|
||||
int current_node = 0;
|
||||
int cpu_count = 0;
|
||||
|
||||
for (node = 0; node < common -> num_nodes; node ++) {
|
||||
core = 0;
|
||||
|
|
@ -382,33 +385,84 @@ static void numa_mapping(void) {
|
|||
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
|
||||
#endif
|
||||
|
||||
h = 1;
|
||||
current_cpu = sched_getcpu();
|
||||
for (cpu = 0; cpu < count; cpu++) {
|
||||
if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
|
||||
current_node = READ_NODE(common -> cpu_info[cpu]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < MAX_BITMASK_LEN; i++)
|
||||
cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
|
||||
|
||||
while (h < count) h = 2 * h + 1;
|
||||
/*
|
||||
* If all the processes can be accommodated in the
|
||||
* current node itself, then bind to cores
|
||||
* from the current node only
|
||||
*/
|
||||
if (numprocs <= cpu_count) {
|
||||
/*
|
||||
* First sort all the cores in order from the current node.
|
||||
* Then take remaining nodes one by one in order,
|
||||
* and sort their cores in order.
|
||||
*/
|
||||
for (i = 0; i < count; i++) {
|
||||
for (j = 0; j < count - 1; j++) {
|
||||
int node_1, node_2;
|
||||
int core_1, core_2;
|
||||
int swap = 0;
|
||||
|
||||
while (h > 1) {
|
||||
h /= 2;
|
||||
for (i = h; i < count; i++) {
|
||||
work = common -> cpu_info[i];
|
||||
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
|
||||
j = i - h;
|
||||
while (work < common -> cpu_info[j]) {
|
||||
common -> cpu_info[j + h] = common -> cpu_info[j];
|
||||
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
|
||||
CPU_SET(j + h, &cpu_orig_mask[0]);
|
||||
} else {
|
||||
CPU_CLR(j + h, &cpu_orig_mask[0]);
|
||||
}
|
||||
j -= h;
|
||||
if (j < 0) break;
|
||||
}
|
||||
common -> cpu_info[j + h] = work;
|
||||
if (bit) {
|
||||
CPU_SET(j + h, &cpu_orig_mask[0]);
|
||||
} else {
|
||||
CPU_CLR(j + h, &cpu_orig_mask[0]);
|
||||
node_1 = READ_NODE(common -> cpu_info[j]);
|
||||
node_2 = READ_NODE(common -> cpu_info[j + 1]);
|
||||
core_1 = READ_CORE(common -> cpu_info[j]);
|
||||
core_2 = READ_CORE(common -> cpu_info[j + 1]);
|
||||
|
||||
if (node_1 == node_2) {
|
||||
if (core_1 > core_2)
|
||||
swap = 1;
|
||||
} else {
|
||||
if ((node_2 == current_node) ||
|
||||
((node_1 != current_node) && (node_1 > node_2)))
|
||||
swap = 1;
|
||||
}
|
||||
if (swap) {
|
||||
unsigned long temp;
|
||||
|
||||
temp = common->cpu_info[j];
|
||||
common->cpu_info[j] = common->cpu_info[j + 1];
|
||||
common->cpu_info[j + 1] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
h = 1;
|
||||
|
||||
while (h < count) h = 2 * h + 1;
|
||||
|
||||
while (h > 1) {
|
||||
h /= 2;
|
||||
for (i = h; i < count; i++) {
|
||||
work = common -> cpu_info[i];
|
||||
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
|
||||
j = i - h;
|
||||
while (work < common -> cpu_info[j]) {
|
||||
common -> cpu_info[j + h] = common -> cpu_info[j];
|
||||
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
|
||||
CPU_SET(j + h, &cpu_orig_mask[0]);
|
||||
} else {
|
||||
CPU_CLR(j + h, &cpu_orig_mask[0]);
|
||||
}
|
||||
j -= h;
|
||||
if (j < 0) break;
|
||||
}
|
||||
common -> cpu_info[j + h] = work;
|
||||
if (bit) {
|
||||
CPU_SET(j + h, &cpu_orig_mask[0]);
|
||||
} else {
|
||||
CPU_CLR(j + h, &cpu_orig_mask[0]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -416,7 +470,10 @@ static void numa_mapping(void) {
|
|||
fprintf(stderr, "\nSorting ...\n\n");
|
||||
|
||||
for (cpu = 0; cpu < count; cpu++)
|
||||
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
|
||||
fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
|
||||
READ_CPU(common -> cpu_info[cpu]),
|
||||
READ_CORE(common -> cpu_info[cpu]),
|
||||
READ_NODE(common -> cpu_info[cpu]));
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ int get_L2_size(void){
|
|||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
|
@ -251,7 +251,7 @@ int get_L2_size(void){
|
|||
void blas_set_parameter(void){
|
||||
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
|
|
|||
|
|
@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
|||
endif
|
||||
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
|
|
|
|||
9
f_check
9
f_check
|
|
@ -114,7 +114,7 @@ if ($compiler eq "") {
|
|||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /IBM/) {
|
||||
if ($data =~ /IBM XL/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
|
@ -223,7 +223,12 @@ if (!$?) {
|
|||
}
|
||||
#For gfortran MIPS
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
|
||||
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
|
||||
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
} else {
|
||||
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
|
|
|
|||
49
getarch.c
49
getarch.c
|
|
@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3A */
|
||||
/* #define FORCE_LOONGSON3B */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
/* #define FORCE_ITANIUM2 */
|
||||
/* #define FORCE_SPARC */
|
||||
/* #define FORCE_SPARCV7 */
|
||||
|
|
@ -699,6 +702,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_I6400
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "I6400"
|
||||
#define SUBDIRNAME "mips64"
|
||||
#define ARCHCONFIG "-DI6400 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "i6400"
|
||||
#define CORENAME "I6400"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_P6600
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "P6600"
|
||||
#define SUBDIRNAME "mips64"
|
||||
#define ARCHCONFIG "-DP6600 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "p6600"
|
||||
#define CORENAME "P6600"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_P5600
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "P5600"
|
||||
#define SUBDIRNAME "mips"
|
||||
#define ARCHCONFIG "-DP5600 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "p5600"
|
||||
#define CORENAME "P5600"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ITANIUM2
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "IA64"
|
||||
|
|
@ -888,7 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef __mips__
|
||||
#ifdef __mips64
|
||||
#include "cpuid_mips64.c"
|
||||
#else
|
||||
#include "cpuid_mips.c"
|
||||
#endif
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
|
||||
set(BLAS1_SOURCES
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -42,6 +42,10 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
// Disable multi-threading as it does not show any performance
|
||||
// benefits. Keep the multi-threading code for the record.
|
||||
#undef SMP
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
|
||||
|
|
|
|||
|
|
@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
{
|
||||
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
|
||||
// It seems to be required for some K8 or Barcelona CPU
|
||||
buffer_size += 8;
|
||||
if(incx != 1)
|
||||
buffer_size += n * 2;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
|
||||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake")
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake")
|
||||
|
||||
# Makefile
|
||||
|
||||
|
|
|
|||
|
|
@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64)
|
|||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), MIPS)
|
||||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), arm)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
{
|
||||
BLASLONG i=0,j=0;
|
||||
|
||||
if ( (n <= 0) || (inc_x <= 0))
|
||||
return(0);
|
||||
|
||||
|
||||
while(j < n)
|
||||
{
|
||||
|
||||
|
|
|
|||
|
|
@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
|
|||
BLASLONG ip = 0;
|
||||
FLOAT temp;
|
||||
|
||||
if ( (n <= 0) || (inc_x <= 0))
|
||||
return(0);
|
||||
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
str TMPF, [Y], #SZ
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.2s}, [X], #8
|
||||
st1 {v0.2s}, [Y], #8
|
||||
ldr d0, [X], #8
|
||||
str d0, [Y], #8
|
||||
#else
|
||||
ld1 {v0.2d}, [X], #16
|
||||
st1 {v0.2d}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
#else // DOUBLE
|
||||
ld1 {v0.4s}, [X], #16
|
||||
ld1 {v1.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
st1 {v1.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
ldr q1, [X], #16
|
||||
str q1, [Y], #16
|
||||
|
||||
#endif
|
||||
#else // COMPLEX
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.4s}, [X], #16
|
||||
ld1 {v1.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
st1 {v1.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
ldr q1, [X], #16
|
||||
str q1, [Y], #16
|
||||
#else // DOUBLE
|
||||
ld1 {v0.4s}, [X], #16
|
||||
ld1 {v1.4s}, [X], #16
|
||||
ld1 {v2.4s}, [X], #16
|
||||
ld1 {v3.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
st1 {v1.4s}, [Y], #16
|
||||
st1 {v2.4s}, [Y], #16
|
||||
st1 {v3.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
ldr q1, [X], #16
|
||||
str q1, [Y], #16
|
||||
ldr q2, [X], #16
|
||||
str q2, [Y], #16
|
||||
ldr q3, [X], #16
|
||||
str q3, [Y], #16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ldp q2, q3, [pCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0
|
||||
|
|
@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ldp q6, q7, [pCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0
|
||||
|
|
@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
ldp q2, q3, [pCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0
|
||||
|
|
@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
ldp q6, q7, [pCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0
|
||||
|
|
@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
fmla v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
ld1 {v12.2d, v13.2d}, [pCRow3]
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
fmla v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
st1 {v12.2d, v13.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #16
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #16
|
||||
|
||||
ld1 {v8.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
add pCRow2, pCRow2, #16
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
ld1 {v12.2d}, [pCRow3]
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
st1 {v12.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
add pCRow3, pCRow3, #16
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
ld1 {v8.d}[1], [pCRow1]
|
||||
|
|
@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st1 {v8.d}[0], [pCRow0]
|
||||
st1 {v8.d}[1], [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow1, pCRow2, LDC
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #8
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #8
|
||||
|
||||
ld1 {v12.d}[0], [pCRow2]
|
||||
ld1 {v12.d}[1], [pCRow1]
|
||||
ld1 {v12.d}[1], [pCRow3]
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
st1 {v12.d}[1], [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #8
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
add pCRow2, pCRow2, #8
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
add pCRow3, pCRow3, #8
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
|
|
@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE8x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
|
|
@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #64
|
||||
|
||||
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV0
|
||||
|
|
@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla v7.2d, v23.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #64
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1 , pCRow0, LDC
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #16
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #16
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
ld1 {v8.d}[1], [pCRow1]
|
||||
|
|
@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st1 {v8.d}[0], [pCRow0]
|
||||
st1 {v8.d}[1], [pCRow1]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #8
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add pCRow1, pCRow1, #8
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
fmla v1.2d, v17.2d, alphaV0
|
||||
|
|
@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #64
|
||||
.endm
|
||||
|
||||
|
|
@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #32
|
||||
.endm
|
||||
|
||||
|
|
@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #16
|
||||
.endm
|
||||
|
||||
|
|
@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmadd d8, d16, alpha0, d8
|
||||
str d8, [pCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add pCRow0, pCRow0, #8
|
||||
.endm
|
||||
|
||||
|
|
@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/******************************************************************************/
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
|
@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20:
|
|||
cmp counterL , #0
|
||||
ble dgemm_kernel_L4_M4_40
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M4_22
|
||||
|
|
@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40:
|
|||
dgemm_kernel_L4_M4_42:
|
||||
|
||||
KERNEL4x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M4_42
|
||||
|
|
@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100:
|
|||
|
||||
dgemm_kernel_L4_M4_END:
|
||||
|
||||
|
||||
dgemm_kernel_L4_M2_BEGIN:
|
||||
|
||||
mov counterI, origM
|
||||
|
|
@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20:
|
|||
cmp counterL , #0
|
||||
ble dgemm_kernel_L4_M2_40
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M2_22:
|
||||
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL2x4_SUB
|
||||
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL2x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
|
|
@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40:
|
|||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L4_M2_100
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
dgemm_kernel_L4_M2_42:
|
||||
|
||||
KERNEL2x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M2_42
|
||||
|
|
@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20:
|
|||
cmp counterL , #0
|
||||
ble dgemm_kernel_L4_M1_40
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M1_22:
|
||||
KERNEL1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x4_SUB
|
||||
KERNEL1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x4_SUB
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
KERNEL1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x4_SUB
|
||||
KERNEL1x4_SUB
|
||||
KERNEL1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
|
|
@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40:
|
|||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L4_M1_100
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
dgemm_kernel_L4_M1_42:
|
||||
|
||||
KERNEL1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M1_42
|
||||
|
|
@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
|
|||
tst counterJ , #2
|
||||
ble dgemm_kernel_L1_BEGIN
|
||||
|
||||
mov pCRow0, pC // pCRow0 = pC
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
add pC,pC,LDC, lsl #1
|
||||
add pC, pCRow1, LDC
|
||||
|
||||
mov pA, origPA // pA = A
|
||||
|
||||
|
|
@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble dgemm_kernel_L2_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L2_M8_20:
|
||||
|
||||
INIT8x2
|
||||
|
|
@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20:
|
|||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL,#0
|
||||
ble dgemm_kernel_L2_M8_40
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L2_M8_22:
|
||||
KERNEL8x2_SUB
|
||||
KERNEL8x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL8x2_SUB
|
||||
KERNEL8x2_SUB
|
||||
|
||||
KERNEL8x2_SUB
|
||||
KERNEL8x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL8x2_SUB
|
||||
KERNEL8x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L2_M8_22
|
||||
|
||||
|
||||
dgemm_kernel_L2_M8_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L2_M8_100
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
dgemm_kernel_L2_M8_42:
|
||||
|
||||
KERNEL8x2_SUB
|
||||
|
|
@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20:
|
|||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL,#0
|
||||
ble dgemm_kernel_L2_M4_40
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L2_M4_22:
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x2_SUB
|
||||
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
|
|
@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40:
|
|||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L2_M4_100
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
dgemm_kernel_L2_M4_42:
|
||||
|
||||
KERNEL4x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L2_M4_42
|
||||
|
|
@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20:
|
|||
dgemm_kernel_L2_M2_22:
|
||||
|
||||
KERNEL2x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL2x2_SUB
|
||||
KERNEL2x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL2x2_SUB
|
||||
|
||||
KERNEL2x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL2x2_SUB
|
||||
KERNEL2x2_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL2x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L2_M2_22
|
||||
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
dgemm_kernel_L2_M2_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
|
|
@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20:
|
|||
dgemm_kernel_L2_M1_22:
|
||||
KERNEL1x2_SUB
|
||||
KERNEL1x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x2_SUB
|
||||
KERNEL1x2_SUB
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
KERNEL1x2_SUB
|
||||
KERNEL1x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x2_SUB
|
||||
KERNEL1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L2_M1_22
|
||||
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
dgemm_kernel_L2_M1_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
|
|
@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble dgemm_kernel_L1_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L1_M8_20:
|
||||
|
||||
INIT8x1
|
||||
|
|
@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20:
|
|||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL , #0
|
||||
ble dgemm_kernel_L1_M8_40
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L1_M8_22:
|
||||
KERNEL8x1_SUB
|
||||
KERNEL8x1_SUB
|
||||
KERNEL8x1_SUB
|
||||
KERNEL8x1_SUB
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
KERNEL8x1_SUB
|
||||
KERNEL8x1_SUB
|
||||
KERNEL8x1_SUB
|
||||
|
|
@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40:
|
|||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L1_M8_100
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
dgemm_kernel_L1_M8_42:
|
||||
|
||||
KERNEL8x1_SUB
|
||||
|
|
@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20:
|
|||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL , #0
|
||||
ble dgemm_kernel_L1_M4_40
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L1_M4_22:
|
||||
KERNEL4x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x1_SUB
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
KERNEL4x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL4x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
|
|
@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40:
|
|||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L1_M4_100
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
dgemm_kernel_L1_M4_42:
|
||||
|
||||
KERNEL4x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L1_M4_42
|
||||
|
|
@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22:
|
|||
|
||||
KERNEL2x1_SUB
|
||||
KERNEL2x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL2x1_SUB
|
||||
KERNEL2x1_SUB
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
KERNEL2x1_SUB
|
||||
KERNEL2x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL2x1_SUB
|
||||
KERNEL2x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L1_M2_22
|
||||
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
dgemm_kernel_L1_M2_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
|
|
@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20:
|
|||
cmp counterL , #0
|
||||
ble dgemm_kernel_L1_M1_40
|
||||
|
||||
|
||||
dgemm_kernel_L1_M1_22:
|
||||
KERNEL1x1_SUB
|
||||
KERNEL1x1_SUB
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
KERNEL1x1_SUB
|
||||
KERNEL1x1_SUB
|
||||
|
||||
KERNEL1x1_SUB
|
||||
KERNEL1x1_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNEL1x1_SUB
|
||||
KERNEL1x1_SUB
|
||||
|
||||
|
|
@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40:
|
|||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble dgemm_kernel_L1_M1_100
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
dgemm_kernel_L1_M1_42:
|
||||
|
||||
KERNEL1x1_SUB
|
||||
|
|
|
|||
|
|
@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define temp x16
|
||||
#define tempOffset x17
|
||||
#define tempK x18
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alpha x17
|
||||
#define temp x18
|
||||
#define tempOffset x19
|
||||
#define tempK x20
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaV0 v10.d[0]
|
||||
#define alpha1 d11
|
||||
#define alphaV1 v11.d[0]
|
||||
#define alpha2 d14
|
||||
#define alphaV2 v14.d[0]
|
||||
#define alpha3 d15
|
||||
#define alphaV3 v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
|
|
@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
//v05 pA1_2, pA1_3
|
||||
//v06 pA1_4, pA1_5
|
||||
//v07 pA1_6, pA1_7
|
||||
//v08 must save pB0_0, pB0_1
|
||||
//v09 must save pB0_2, pB0_3
|
||||
//v10 must save ALPHA0
|
||||
//v11 must save ALPHA1
|
||||
//v12 must save pB1_0, pB1_1
|
||||
//v13 must save pB1_2, pB1_3
|
||||
//v14 must save ALPHA2
|
||||
//v15 must save ALPHA3
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2 --> ALPHA0
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB1_0
|
||||
//v13 must save pB1_1
|
||||
//v14 must save pB1_2
|
||||
//v15 must save pB1_3
|
||||
//v16 must save C00, C01
|
||||
//v17 must save C02, C03
|
||||
//v18 C04, C05
|
||||
|
|
@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_I
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
fmul v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
fmul v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
fmul v23.2d, v3.2d, v8.d[1]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
fmul v26.2d, v2.2d, v9.d[0]
|
||||
fmul v27.2d, v3.2d, v9.d[0]
|
||||
fmul v31.2d, v3.2d, v11.d[0]
|
||||
fmul v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
fmul v30.2d, v2.2d, v9.d[1]
|
||||
fmul v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
add pCRow1, pCRow0, LDC
|
||||
fmov alpha0, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
stp q2, q3, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0
|
||||
fmul v5.2d, v21.2d, alphaV1
|
||||
fmul v6.2d, v22.2d, alphaV2
|
||||
fmul v7.2d, v23.2d, alphaV3
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmul v5.2d, v21.2d, alphaV0
|
||||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
fmul v6.2d, v22.2d, alphaV0
|
||||
fmul v7.2d, v23.2d, alphaV0
|
||||
stp q6, q7, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v24.2d, alphaV0
|
||||
fmul v1.2d, v25.2d, alphaV1
|
||||
fmul v2.2d, v26.2d, alphaV2
|
||||
fmul v3.2d, v27.2d, alphaV3
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
|
||||
fmul v1.2d, v25.2d, alphaV0
|
||||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
fmul v2.2d, v26.2d, alphaV0
|
||||
fmul v3.2d, v27.2d, alphaV0
|
||||
stp q2, q3, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0
|
||||
fmul v5.2d, v29.2d, alphaV1
|
||||
fmul v6.2d, v30.2d, alphaV2
|
||||
fmul v7.2d, v31.2d, alphaV3
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmul v5.2d, v29.2d, alphaV0
|
||||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v6.2d, v30.2d, alphaV0
|
||||
fmul v7.2d, v31.2d, alphaV0
|
||||
stp q6, q7, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV2
|
||||
fmul v13.2d, v21.2d, alphaV3
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
fmul v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
fmul v8.2d, v24.2d, alphaV0
|
||||
fmul v9.2d, v25.2d, alphaV1
|
||||
fmul v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v28.2d, alphaV2
|
||||
fmul v13.2d, v29.2d, alphaV3
|
||||
fmul v12.2d, v28.2d, alphaV0
|
||||
fmul v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
fmul v8.2d, v24.2d, alphaV2
|
||||
fmul v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v28.2d, alphaV3
|
||||
fmul v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
|
|
@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add pCRow2, pCRow1, LDC
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
|
||||
|
|
@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0
|
||||
fmul v5.2d, v21.2d, alphaV1
|
||||
fmul v6.2d, v22.2d, alphaV2
|
||||
fmul v7.2d, v23.2d, alphaV3
|
||||
fmul v5.2d, v21.2d, alphaV0
|
||||
fmul v6.2d, v22.2d, alphaV0
|
||||
fmul v7.2d, v23.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV2
|
||||
fmul v13.2d, v21.2d, alphaV3
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
fmul v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
|
|
@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
fmov alpha0, alpha
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
|
|
@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0, alpha
|
||||
fmul d8, d16, alpha0
|
||||
str d8, [pCRow0]
|
||||
|
||||
|
|
@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha0, d0
|
||||
fmov alpha1, d0
|
||||
fmov alpha2, d0
|
||||
fmov alpha3, d0
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, d0
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
|
||||
|
|
@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/******************************************************************************/
|
||||
|
||||
dtrmm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
|
||||
#if defined(LEFT)
|
||||
mov tempOffset, offset
|
||||
|
|
@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble dtrmm_kernel_L4_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_20:
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
|
@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
|
|||
add tempK, tempOffset, #4
|
||||
#endif
|
||||
|
||||
asr counterL , tempK, #1 // L = K / 2
|
||||
asr counterL , tempK, #3 // L = K / 8
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
blt dtrmm_kernel_L4_M8_32
|
||||
|
||||
KERNEL8x4_I // do one in the K
|
||||
KERNEL8x4_M2 // do another in the K
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble dtrmm_kernel_L4_M8_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_22:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dtrmm_kernel_L4_M8_22
|
||||
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_22a:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dtrmm_kernel_L4_M8_44
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble dtrmm_kernel_L4_M8_40
|
||||
|
||||
KERNEL8x4_I
|
||||
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dtrmm_kernel_L4_M8_44
|
||||
|
|
@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
|
|||
|
||||
dtrmm_kernel_L4_M8_44:
|
||||
|
||||
ands counterL , tempK, #1
|
||||
ands counterL , tempK, #7
|
||||
ble dtrmm_kernel_L4_M8_100
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_46:
|
||||
|
||||
KERNEL8x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne dtrmm_kernel_L4_M8_46
|
||||
|
||||
dtrmm_kernel_L4_M8_100:
|
||||
|
||||
SAVE8x4
|
||||
|
|
@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
|
|||
#if defined(LEFT)
|
||||
add tempOffset, tempOffset, #8
|
||||
#endif
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
dtrmm_kernel_L4_M8_END:
|
||||
subs counterI, counterI, #1
|
||||
|
|
|
|||
|
|
@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SHZ 3
|
||||
#endif
|
||||
|
||||
#define A_PRE_SIZE 768
|
||||
#define Y_PRE_SIZE 768
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
|
||||
|
|
@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.4s, v3.4s}, [A_PTR], #32
|
||||
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
|
||||
fmla v4.4s, v1.4s, v2.4s
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
fmla v5.4s, v1.4s, v3.4s
|
||||
st1 {v4.4s, v5.4s}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v6.4s, v7.4s}, [A_PTR], #32
|
||||
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
|
||||
fmla v8.4s, v1.4s, v6.4s
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
fmla v9.4s, v1.4s, v7.4s
|
||||
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
|
||||
#else //DOUBLE
|
||||
ld1 {v2.2d, v3.2d}, [A_PTR], #32
|
||||
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
|
||||
fmla v4.2d, v1.2d, v2.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
fmla v5.2d, v1.2d, v3.2d
|
||||
st1 {v4.2d, v5.2d}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [A_PTR], #32
|
||||
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
|
||||
fmla v8.2d, v1.2d, v6.2d
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
fmla v9.2d, v1.2d, v7.2d
|
||||
st1 {v8.2d, v9.2d}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v10.2d, v11.2d}, [A_PTR], #32
|
||||
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
|
||||
fmla v12.2d, v1.2d, v10.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
fmla v13.2d, v1.2d, v11.2d
|
||||
st1 {v12.2d, v13.2d}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v14.2d, v15.2d}, [A_PTR], #32
|
||||
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
|
||||
fmla v16.2d, v1.2d, v14.2d
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
fmla v17.2d, v1.2d, v15.2d
|
||||
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define J x11 /* loop variable */
|
||||
#define I x12 /* loop variable */
|
||||
|
||||
#define X_PREFETCH_SIZE 768
|
||||
#define A_PREFETCH_SIZE 768
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
|
@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
|
||||
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
|
||||
fmla v1.4s, v5.4s, v9.4s
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.4s, v6.4s, v10.4s
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.4s, v7.4s, v11.4s
|
||||
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
|
||||
fmla v4.4s, v8.4s, v12.4s
|
||||
|
||||
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
|
||||
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
|
||||
fmla v1.4s, v13.4s, v17.4s
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.4s, v14.4s, v18.4s
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.4s, v15.4s, v19.4s
|
||||
fmla v4.4s, v16.4s, v20.4s
|
||||
#else
|
||||
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
|
||||
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v5.2d, v9.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v6.2d, v10.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v7.2d, v11.2d
|
||||
fmla v4.2d, v8.2d, v12.2d
|
||||
|
||||
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
|
||||
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v13.2d, v17.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v14.2d, v18.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v15.2d, v19.2d
|
||||
fmla v4.2d, v16.2d, v20.2d
|
||||
|
||||
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
|
||||
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v5.2d, v9.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v6.2d, v10.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v7.2d, v11.2d
|
||||
fmla v4.2d, v8.2d, v12.2d
|
||||
|
||||
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
|
||||
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v13.2d, v17.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v14.2d, v18.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v15.2d, v19.2d
|
||||
fmla v4.2d, v16.2d, v20.2d
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fabs MAXF, MAXF
|
||||
.endm
|
||||
|
||||
// KERNEL_F8: process one block of 8 contiguous elements at X.
// Reduces the absolute values of the block to a single maximum (TMPF),
// compares it with the running maximum MAXF, and on COND keeps the old
// MAXF/INDEX, otherwise takes TMPF and the current counter Z. Z is then
// advanced by 8. Only a block-level candidate is recorded in INDEX here.
// NOTE(review): MAXF, TMPF, COND, INDEX, Z and X are defined outside this
// view; the exact element position inside the block is presumably resolved
// later by KERNEL_F8_FINALIZE - confirm against the full file.
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
// Single precision: one 32-byte load = 8 floats, X post-incremented.
ldp q2, q3, [X], #32
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
// Lane-wise max of the two vectors, then across-lanes max into TMPF.
fmax v2.4s, v2.4s, v3.4s
|
||||
fmaxv TMPF, v2.4s
|
||||
fcmp MAXF, TMPF
|
||||
// On COND keep the current MAXF / INDEX, otherwise take TMPF / Z.
fcsel MAXF, MAXF, TMPF, COND
|
||||
csel INDEX, INDEX, Z, COND
|
||||
add Z, Z, #8
|
||||
#else
|
||||
// Double precision: two 32-byte loads = 8 doubles, X post-incremented.
ldp q2, q3, [X], #32
|
||||
ldp q4, q5, [X], #32
|
||||
fabs v2.2d, v2.2d
|
||||
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fabs v5.2d, v5.2d
|
||||
|
||||
// Tree reduction of the four vectors, then pairwise max into TMPF.
fmax v2.2d, v2.2d, v3.2d
|
||||
fmax v4.2d, v4.2d, v5.2d
|
||||
fmax v2.2d, v2.2d, v4.2d
|
||||
fmaxp TMPF, v2.2d
|
||||
|
||||
fcmp MAXF, TMPF
|
||||
// On COND keep the current MAXF / INDEX, otherwise take TMPF / Z.
fcsel MAXF, MAXF, TMPF, COND
|
||||
csel INDEX, INDEX, Z, COND
|
||||
add Z, Z, #8
|
||||
#endif
|
||||
// Prefetch 1024 bytes ahead of the (already advanced) X pointer.
PRFM PLDL1KEEP, [X, #1024]
|
||||
.endm
|
||||
|
||||
// KERNEL_F8_FINALIZE: refine the block-level INDEX into an element index.
// Reloads the 8-element block located (INDEX - 1) elements past x7, takes
// absolute values, and compares each element with MAXF. The candidates are
// tested from INDEX+7 down to INDEX+0; every equal compare overwrites
// INDEX, so when several elements equal MAXF the lowest candidate wins.
// NOTE(review): x7 is expected to hold the original X base pointer (the
// visible caller does `mov x7, X`), and INDEX looks 1-based - confirm.
// Clobbers x6, x7 and v2-v7.
.macro KERNEL_F8_FINALIZE
|
||||
sub x6, INDEX, #1
|
||||
#if !defined(DOUBLE)
|
||||
// Scale the element offset by 4 bytes (float) and reload all 8 values.
lsl x6, x6, #2
|
||||
add x7, x7, x6
|
||||
ldp q2, q3, [x7]
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
|
||||
// Spread elements 4..7 (v3) into lane 0 of v4..v7 so they can be
// compared as scalars s4..s7.
ins v4.s[0], v3.s[0]
|
||||
ins v5.s[0], v3.s[1]
|
||||
ins v6.s[0], v3.s[2]
|
||||
ins v7.s[0], v3.s[3]
|
||||
|
||||
// x6 becomes the candidate index, starting at INDEX+7 and counting down.
add x6, INDEX, #7
|
||||
fcmp MAXF, s7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
// Same again for elements 0..3 (v2).
ins v4.s[0], v2.s[0]
|
||||
ins v5.s[0], v2.s[1]
|
||||
ins v6.s[0], v2.s[2]
|
||||
ins v7.s[0], v2.s[3]
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#else
|
||||
// Double precision: first address the upper half of the block
// (elements 4..7), hence the extra +4 before scaling by 8 bytes.
add x6, x6, #4
|
||||
lsl x6, x6, #3
|
||||
add x7, x7, x6
|
||||
ldp q2, q3, [x7]
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
|
||||
ins v4.d[0], v2.d[0]
|
||||
ins v5.d[0], v2.d[1]
|
||||
ins v6.d[0], v3.d[0]
|
||||
ins v7.d[0], v3.d[1]
|
||||
|
||||
// Candidate index starts at INDEX+7 and counts down.
add x6, INDEX, #7
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
// Step back 32 bytes to the lower half of the block (elements 0..3).
sub x7, x7, #32
|
||||
ldp q2, q3, [x7]
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
|
||||
ins v4.d[0], v2.d[0]
|
||||
ins v5.d[0], v2.d[1]
|
||||
ins v6.d[0], v3.d[0]
|
||||
ins v7.d[0], v3.d[1]
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
add Z, Z, #1
|
||||
|
|
@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
cmp INC_X, #1
|
||||
bne iamax_kernel_S_BEGIN
|
||||
mov x7, X
|
||||
|
||||
iamax_kernel_F_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq iamax_kernel_F1
|
||||
|
||||
add Z, Z, #1
|
||||
iamax_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F8
|
||||
|
||||
KERNEL_F8_FINALIZE
|
||||
|
||||
sub Z, Z, #1
|
||||
iamax_kernel_F1:
|
||||
|
||||
ands I, N, #7
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F10
|
||||
|
||||
b iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
|
|
|
|||
|
|
@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
.endm
|
||||
|
||||
// KERNEL_F8: process one block of 8 two-component elements at X.
// After fabs, faddp adds adjacent lanes, giving one sum per pair of
// consecutive values; the maximum of the 8 sums (TMPF) is compared with the
// running maximum MAXF. On COND the old MAXF/INDEX are kept, otherwise TMPF
// and the current counter Z are taken. Z is then advanced by 8.
// NOTE(review): the pairs are presumably the real and imaginary parts of
// complex elements (|re| + |im|); MAXF, TMPF, COND, INDEX, Z and X are
// defined outside this view - confirm against the full file.
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
// Single precision: 64 bytes = 16 floats = 8 pairs, X post-incremented.
ldp q2, q3, [X], #32
|
||||
ldp q4, q5, [X], #32
|
||||
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
fabs v4.4s, v4.4s
|
||||
fabs v5.4s, v5.4s
|
||||
|
||||
// Pairwise add: v2 holds the sums for pairs 0..3, v3 for pairs 4..7.
faddp v2.4s, v2.4s, v3.4s
|
||||
faddp v3.4s, v4.4s, v5.4s
|
||||
|
||||
fmax v2.4s, v2.4s, v3.4s
|
||||
fmaxv TMPF, v2.4s
|
||||
fcmp MAXF, TMPF
|
||||
// On COND keep the current MAXF / INDEX, otherwise take TMPF / Z.
fcsel MAXF, MAXF, TMPF, COND
|
||||
csel INDEX, INDEX, Z, COND
|
||||
add Z, Z, #8
|
||||
#else
|
||||
// Double precision: 128 bytes = 16 doubles = 8 pairs, X post-incremented.
ldp q2, q3, [X], #32
|
||||
ldp q4, q5, [X], #32
|
||||
ldp q16, q17, [X], #32
|
||||
ldp q18, q19, [X], #32
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fabs v5.2d, v5.2d
|
||||
fabs v16.2d, v16.2d
|
||||
fabs v17.2d, v17.2d
|
||||
fabs v18.2d, v18.2d
|
||||
fabs v19.2d, v19.2d
|
||||
|
||||
// Pairwise add: v2..v5 each hold the sums for two consecutive pairs.
faddp v2.2d, v2.2d, v3.2d
|
||||
faddp v3.2d, v4.2d, v5.2d
|
||||
faddp v4.2d, v16.2d, v17.2d
|
||||
faddp v5.2d, v18.2d, v19.2d
|
||||
|
||||
// Tree reduction of the four vectors, then pairwise max into TMPF.
fmax v2.2d, v2.2d, v3.2d
|
||||
fmax v4.2d, v4.2d, v5.2d
|
||||
fmax v2.2d, v2.2d, v4.2d
|
||||
fmaxp TMPF, v2.2d
|
||||
|
||||
fcmp MAXF, TMPF
|
||||
// On COND keep the current MAXF / INDEX, otherwise take TMPF / Z.
fcsel MAXF, MAXF, TMPF, COND
|
||||
csel INDEX, INDEX, Z, COND
|
||||
add Z, Z, #8
|
||||
#endif
|
||||
// Prefetch 1024 bytes ahead of the (already advanced) X pointer.
PRFM PLDL1KEEP, [X, #1024]
|
||||
.endm
|
||||
|
||||
// KERNEL_F8_FINALIZE: refine the block-level INDEX into an element index.
// Reloads the block of 8 two-component elements located (INDEX - 1)
// elements past x7, recomputes the per-pair sums of absolute values with
// fabs + faddp, and compares each sum with MAXF. Candidates are tested from
// INDEX+7 down to INDEX+0; every equal compare overwrites INDEX, so when
// several sums equal MAXF the lowest candidate wins.
// NOTE(review): x7 is expected to hold the original X base pointer (the
// visible caller does `mov x7, X`), and INDEX looks 1-based - confirm.
// Clobbers x6, x7, v2-v5 and v7, plus v16-v19 in the DOUBLE path.
.macro KERNEL_F8_FINALIZE
|
||||
sub x6, INDEX, #1
|
||||
#if !defined(DOUBLE)
|
||||
// Scale the element offset by 8 bytes (two floats per element).
lsl x6, x6, #3
|
||||
add x7, x7, x6
|
||||
|
||||
ldp q2, q3, [x7]
|
||||
ldp q4, q5, [x7, #32]
|
||||
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
fabs v4.4s, v4.4s
|
||||
fabs v5.4s, v5.4s
|
||||
|
||||
// v2 = sums for elements 0..3, v3 = sums for elements 4..7.
faddp v2.4s, v2.4s, v3.4s
|
||||
faddp v3.4s, v4.4s, v5.4s
|
||||
|
||||
// v4 lane 0 is reused as scratch for the scalar compare of each sum;
// x6 is the candidate index, starting at INDEX+7 and counting down.
ins v4.s[0], v3.s[3]
|
||||
add x6, INDEX, #7
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v3.s[2]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v3.s[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v3.s[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v2.s[3]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v2.s[2]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v2.s[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v4.s[0], v2.s[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#else
|
||||
// Scale the element offset by 16 bytes (two doubles per element).
lsl x6, x6, #4
|
||||
add x7, x7, x6
|
||||
|
||||
ldp q2, q3, [x7]
|
||||
ldp q4, q5, [x7, #32]
|
||||
ldp q16, q17, [x7, #64]
|
||||
ldp q18, q19, [x7, #96]
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fabs v5.2d, v5.2d
|
||||
fabs v16.2d, v16.2d
|
||||
fabs v17.2d, v17.2d
|
||||
fabs v18.2d, v18.2d
|
||||
fabs v19.2d, v19.2d
|
||||
|
||||
// v2..v5 each hold the sums for two consecutive elements (0..7 in order).
faddp v2.2d, v2.2d, v3.2d
|
||||
faddp v3.2d, v4.2d, v5.2d
|
||||
faddp v4.2d, v16.2d, v17.2d
|
||||
faddp v5.2d, v18.2d, v19.2d
|
||||
|
||||
// v7 lane 0 is scratch for the scalar compare; x6 counts down from
// INDEX+7.
ins v7.d[0], v5.d[1]
|
||||
add x6, INDEX, #7
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v5.d[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v4.d[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v4.d[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v3.d[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v3.d[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v2.d[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
ins v7.d[0], v2.d[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
|
|
@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
cmp INC_X, #1
|
||||
bne iamax_kernel_S_BEGIN
|
||||
mov x7, X
|
||||
|
||||
|
||||
iamax_kernel_F_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
asr I, N, #3
|
||||
cmp I, xzr
|
||||
ble iamax_kernel_F1
|
||||
|
||||
add Z, Z, #1
|
||||
|
||||
iamax_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F8
|
||||
|
||||
KERNEL_F8_FINALIZE
|
||||
|
||||
sub Z, Z, #1
|
||||
iamax_kernel_F1:
|
||||
|
||||
ands I, N, #7
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F10
|
||||
|
||||
b iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define alpha_save_R x16
|
||||
#define alpha_save_I x17
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR x17
|
||||
#define alphaI x18
|
||||
|
||||
#define alpha0_R d10
|
||||
#define alphaV0_R v10.d[0]
|
||||
#define alpha0_I d11
|
||||
#define alphaV0_I v11.d[0]
|
||||
|
||||
#define alpha1_R d14
|
||||
#define alphaV1_R v14.d[0]
|
||||
#define alpha1_I d15
|
||||
#define alphaV1_I v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define OP_rr fmla
|
||||
|
|
@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16 alpha_save_R
|
||||
// 17 alpha_save_I
|
||||
// 18 must save
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 alpha_save_R
|
||||
// 18 must save alpha_save_I
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
|
|
@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_I
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
|
|
@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
|
|
@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
|
@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
|
|
@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
|
|
@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
|
|
@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
|
|
@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
|
|
@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
|
|
@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
|
|
@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
|
|
@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_SUB
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
|
|
@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
ld2 {v0.2d, v1.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
ld2 {v6.2d, v7.2d}, [pCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmla v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmla v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmla v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0_R
|
||||
fmls v2.2d, v27.2d, alphaV0_I
|
||||
fmla v3.2d, v26.2d, alphaV1_I
|
||||
fmla v3.2d, v27.2d, alphaV1_R
|
||||
fmla v3.2d, v26.2d, alphaV0_I
|
||||
fmla v3.2d, v27.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
ld2 {v4.2d, v5.2d}, [pCRow3]
|
||||
fmla v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmla v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
|
||||
ld2 {v6.2d, v7.2d}, [pCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0_R
|
||||
fmls v6.2d, v31.2d, alphaV0_I
|
||||
fmla v7.2d, v30.2d, alphaV1_I
|
||||
fmla v7.2d, v31.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v7.2d, v30.2d, alphaV0_I
|
||||
fmla v7.2d, v31.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmla v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
fmla v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmla v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
fmla v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmla d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmla d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d24, alphaV0_R
|
||||
fmls d0, d25, alphaV0_I
|
||||
fmla d1, d24, alphaV1_I
|
||||
fmla d1, d25, alphaV1_R
|
||||
fmla d1, d24, alphaV0_I
|
||||
fmla d1, d25, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d28, alphaV0_R
|
||||
fmls d4, d29, alphaV0_I
|
||||
fmla d5, d28, alphaV1_I
|
||||
fmla d5, d29, alphaV1_R
|
||||
fmla d5, d28, alphaV0_I
|
||||
fmla d5, d29, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmla v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
fmla v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
|
@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmla d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmla d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha_save_R, d0
|
||||
fmov alpha_save_I, d1
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
fmov alphaI, d1
|
||||
|
||||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
|
||||
|
||||
|
|
@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ble zgemm_kernel_L2_BEGIN
|
||||
|
||||
zgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
zgemm_kernel_L4_M4_BEGIN:
|
||||
|
|
@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble zgemm_kernel_L4_M2_BEGIN
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_20:
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , origK, #3
|
||||
cmp counterL , #2
|
||||
blt zgemm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I // do one in the K
|
||||
KERNEL4x4_M2 // do another in the K
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble zgemm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt zgemm_kernel_L4_M4_22
|
||||
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b zgemm_kernel_L4_M4_44
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble zgemm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b zgemm_kernel_L4_M4_44
|
||||
|
|
@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
|
|||
|
||||
zgemm_kernel_L4_M4_44:
|
||||
|
||||
ands counterL , origK, #1
|
||||
ands counterL , origK, #7
|
||||
ble zgemm_kernel_L4_M4_100
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_46:
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne zgemm_kernel_L4_M4_46
|
||||
|
||||
zgemm_kernel_L4_M4_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVE4x4
|
||||
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define Y_OPTR x13 /* loop Y vector address */
|
||||
#define X_PTR x14 /* loop X vector address */
|
||||
|
||||
#define A_PRE_SIZE 768
|
||||
#define Y_PRE_SIZE 768
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
|
@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(DOUBLE)
|
||||
#define ALPHA_R s0
|
||||
#define ALPHA_I s1
|
||||
#define ALPHA_R_COPY s7
|
||||
#define ALPHA_I_COPY s8
|
||||
#define SHZ 3
|
||||
#else
|
||||
#define ALPHA_R d0
|
||||
#define ALPHA_I d1
|
||||
#define ALPHA_R_COPY d7
|
||||
#define ALPHA_I_COPY d8
|
||||
#define SHZ 4
|
||||
#endif
|
||||
|
||||
|
|
@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.macro INIT
|
||||
/********** INIT FOR F4 LOOP **********/
|
||||
fmov ALPHA_R_COPY, ALPHA_R
|
||||
fmov ALPHA_I_COPY, ALPHA_I
|
||||
#if !defined(DOUBLE)
|
||||
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
|
||||
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
|
||||
ins v7.d[1], v7.d[0]
|
||||
ins v8.d[1], v8.d[0]
|
||||
#else
|
||||
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
|
||||
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
|
||||
#endif
|
||||
|
||||
/******* INIT FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
|
|
@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro INIT_LOOP
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
|
||||
ins v10.s[0], v9.s[1]
|
||||
ins v9.s[1], v9.s[0] // [R(X), R(X)]
|
||||
ins v10.s[1], v10.s[0] // [I(X), I(X)]
|
||||
ins v9.d[1], v9.d[0]
|
||||
ins v10.d[1], v10.d[0]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
|
||||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
|
||||
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
|
||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
|
||||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
eor v12.16b, v12.16b, v12.16b
|
||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
|
||||
fmul v2.2s, v0.2s, v2.2s
|
||||
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
|
||||
ins v3.s[0], v2.s[1]
|
||||
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
dup v22.4s, v2.s[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s3
|
||||
dup v23.4s, v25.s[0] // -I[TEMP]
|
||||
dup v24.4s, v3.s[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
dup v22.4s, v2.s[0] // R[TEMP]
|
||||
dup v23.4s, v3.s[0] // I[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s3
|
||||
dup v24.4s, v25.s[0] // -I[TEMP]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s2
|
||||
dup v22.4s, v25.s[0] // R[TEMP]
|
||||
dup v23.4s, v3.s[0] // I[TEMP]
|
||||
dup v24.4s, v3.s[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s2
|
||||
dup v22.4s, v25.s[0] // R[TEMP]
|
||||
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s3
|
||||
dup v23.4s, v25.s[0] // I[TEMP]
|
||||
dup v24.4s, v25.s[0] // I[TEMP]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
|
|
@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif // CONJ
|
||||
|
||||
#else // DOUBLE
|
||||
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
|
||||
ins v10.d[0], v9.d[1]
|
||||
ins v9.d[1], v9.d[0] // [R(X), R(X)]
|
||||
ins v10.d[1], v10.d[0] // [I(X), I(X)]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
|
||||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
|
||||
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
|
||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
|
||||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
eor v12.16b, v12.16b, v12.16b
|
||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
|
||||
fmul v2.2d, v0.2d, v2.2d
|
||||
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
|
||||
ins v3.d[0], v2.d[1] // I(TEMP)
|
||||
|
||||
/****** INIT_LOOP FOR F4 LOOP ******/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
dup v22.2d, v2.d[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d3
|
||||
dup v23.2d, v25.d[0] // -I[TEMP]
|
||||
dup v24.2d, v3.d[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
dup v22.2d, v2.d[0] // R[TEMP]
|
||||
dup v23.2d, v3.d[0] // I[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d3
|
||||
dup v24.2d, v25.d[0] // -I[TEMP]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d2
|
||||
dup v22.2d, v25.d[0] // R[TEMP]
|
||||
dup v23.2d, v3.d[0] // I[TEMP]
|
||||
dup v24.2d, v3.d[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d2
|
||||
dup v22.2d, v25.d[0] // R[TEMP]
|
||||
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d3
|
||||
dup v23.2d, v25.d[0] // I[TEMP]
|
||||
dup v24.2d, v25.d[0] // I[TEMP]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
|
|
@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
|
||||
fmla v15.4s, v21.4s, v13.4s
|
||||
fmla v15.4s, v23.4s, v14.4s
|
||||
fmla v16.4s, v22.4s, v14.4s
|
||||
fmla v16.4s, v24.4s, v13.4s
|
||||
|
||||
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
|
||||
|
||||
#else // DOUBLE
|
||||
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
|
||||
fmla v15.2d, v21.2d, v13.2d
|
||||
fmla v15.2d, v23.2d, v14.2d
|
||||
fmla v16.2d, v22.2d, v14.2d
|
||||
fmla v16.2d, v24.2d, v13.2d
|
||||
|
||||
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
|
||||
|
||||
ld2 {v17.2d, v18.2d}, [A_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
|
||||
fmla v19.2d, v21.2d, v17.2d
|
||||
fmla v19.2d, v23.2d, v18.2d
|
||||
fmla v20.2d, v22.2d, v18.2d
|
||||
fmla v20.2d, v24.2d, v17.2d
|
||||
|
||||
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
|
||||
|
||||
#endif
|
||||
|
|
@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
|
|||
|
||||
zgemv_n_kernel_F4:
|
||||
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_n_kernel_F4
|
||||
|
|
|
|||
|
|
@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define J x11 /* loop variable */
|
||||
#define I x12 /* loop variable */
|
||||
|
||||
#define A_PRE_SIZE 768
|
||||
#define X_PRE_SIZE 768
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
|
@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld2 {v11.4s, v12.4s}, [X_PTR], #32
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
|
||||
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
|
|
@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else // DOUBLE
|
||||
ld2 {v11.2d, v12.2d}, [X_PTR], #32
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [X_PTR, #512]
|
||||
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
|
|
@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld2 {v17.2d, v18.2d}, [X_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [A_PTR, #512]
|
||||
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
|
|
|
|||
|
|
@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define alpha_save_R x16
|
||||
#define alpha_save_I x17
|
||||
#define temp x18
|
||||
#define tempOffset x19
|
||||
#define tempK x20
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR x17
|
||||
#define alphaI x18
|
||||
#define temp x19
|
||||
#define tempOffset x20
|
||||
#define tempK x21
|
||||
|
||||
#define alpha0_R d10
|
||||
#define alphaV0_R v10.d[0]
|
||||
#define alpha0_I d11
|
||||
#define alphaV0_I v11.d[0]
|
||||
|
||||
#define alpha1_R d14
|
||||
#define alphaV1_R v14.d[0]
|
||||
#define alpha1_I d15
|
||||
#define alphaV1_I v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define OP_rr fmla
|
||||
|
|
@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 offset
|
||||
// 07 offset -> temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
|
|
@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16 alpha_save_R
|
||||
// 17 alpha_save_I
|
||||
// 18 must save temp
|
||||
// 19 must save tempOffset
|
||||
// 20 must save tempK
|
||||
// 21 must save
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 alpha_save_R
|
||||
// 18 must save alpha_save_I
|
||||
// 19 must save temp
|
||||
// 20 must save tempOffset
|
||||
// 21 must save tempK
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
|
|
@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_I
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
|
|
@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
|
|
@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
|
@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
|
@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
|
|
@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
|
|
@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
|
|
@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
|
|
@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
|
|
@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
|
|
@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
|
|
@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
|
|
@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_SUB
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
|
|
@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
fmul v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmul v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmul v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
fmul v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmul v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmul v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
fmul v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmul v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
fmul v2.2d, v26.2d, alphaV0_R
|
||||
fmls v2.2d, v27.2d, alphaV0_I
|
||||
fmul v3.2d, v26.2d, alphaV1_I
|
||||
fmla v3.2d, v27.2d, alphaV1_R
|
||||
fmul v3.2d, v26.2d, alphaV0_I
|
||||
fmla v3.2d, v27.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmul v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
|
||||
fmul v6.2d, v30.2d, alphaV0_R
|
||||
fmls v6.2d, v31.2d, alphaV0_I
|
||||
fmul v7.2d, v30.2d, alphaV1_I
|
||||
fmla v7.2d, v31.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmul v7.2d, v30.2d, alphaV0_I
|
||||
fmla v7.2d, v31.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmul v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
fmul v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmul v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
fmul v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmul d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmul d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmul d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmul d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d0, d24, alphaV0_R
|
||||
fmls d0, d25, alphaV0_I
|
||||
fmul d1, d24, alphaV1_I
|
||||
fmla d1, d25, alphaV1_R
|
||||
fmul d1, d24, alphaV0_I
|
||||
fmla d1, d25, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d4, d28, alphaV0_R
|
||||
fmls d4, d29, alphaV0_I
|
||||
fmul d5, d28, alphaV1_I
|
||||
fmla d5, d29, alphaV1_R
|
||||
fmul d5, d28, alphaV0_I
|
||||
fmla d5, d29, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmul v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmul v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmul v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
fmul v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmul d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmul d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmul d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmul d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmul v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmul v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmul d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmul d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha_save_R, d0
|
||||
fmov alpha_save_I, d1
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
fmov alphaI, d1
|
||||
|
||||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
|
||||
|
||||
|
|
@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ble ztrmm_kernel_L2_BEGIN
|
||||
|
||||
ztrmm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
|
||||
#if defined(LEFT)
|
||||
mov tempOffset, offset
|
||||
|
|
@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble ztrmm_kernel_L4_M2_BEGIN
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_20:
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
|
@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
|
|||
add tempK, tempOffset, #4
|
||||
#endif
|
||||
|
||||
asr counterL , tempK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , tempK, #3
|
||||
cmp counterL , #2
|
||||
blt ztrmm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I // do one in the K
|
||||
KERNEL4x4_M2 // do another in the K
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #2
|
||||
ble ztrmm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt ztrmm_kernel_L4_M4_22
|
||||
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b ztrmm_kernel_L4_M4_44
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble ztrmm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b ztrmm_kernel_L4_M4_44
|
||||
|
|
@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
|
|||
|
||||
ztrmm_kernel_L4_M4_44:
|
||||
|
||||
ands counterL , tempK, #1
|
||||
ands counterL , tempK, #7
|
||||
ble ztrmm_kernel_L4_M4_100
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_46:
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne ztrmm_kernel_L4_M4_46
|
||||
|
||||
ztrmm_kernel_L4_M4_100:
|
||||
|
||||
SAVE4x4
|
||||
|
|
@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
|
|||
add tempOffset, tempOffset, #4
|
||||
#endif
|
||||
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
ztrmm_kernel_L4_M4_END:
|
||||
subs counterI, counterI, #1
|
||||
bne ztrmm_kernel_L4_M4_20
|
||||
|
|
|
|||
|
|
@ -0,0 +1,46 @@
|
|||
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = nrm2.c
|
||||
endif
|
||||
|
||||
ifndef DNRM2KERNEL
|
||||
DNRM2KERNEL = nrm2.c
|
||||
endif
|
||||
|
||||
ifndef CNRM2KERNEL
|
||||
CNRM2KERNEL = znrm2.c
|
||||
endif
|
||||
|
||||
ifndef ZNRM2KERNEL
|
||||
ZNRM2KERNEL = znrm2.c
|
||||
endif
|
||||
|
||||
ifndef SCABS_KERNEL
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef DCABS_KERNEL
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef QCABS_KERNEL
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef LSAME_KERNEL
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
endif
|
||||
|
||||
ifndef SGEMM_BETA
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef DGEMM_BETA
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef CGEMM_BETA
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
# Kernel source selection.
#
# Level-1 routines use the portable C kernels in ../mips. Where an MSA
# (SIMD) implementation is listed it is selected when HAVE_MSA is defined;
# otherwise the portable C kernel is used instead.

# --- max / min of absolute values ------------------------------------------
SAMAXKERNEL = ../mips/amax.c
DAMAXKERNEL = ../mips/amax.c
CAMAXKERNEL = ../mips/zamax.c
ZAMAXKERNEL = ../mips/zamax.c

SAMINKERNEL = ../mips/amin.c
DAMINKERNEL = ../mips/amin.c
CAMINKERNEL = ../mips/zamin.c
ZAMINKERNEL = ../mips/zamin.c

# --- max / min of values ----------------------------------------------------
SMAXKERNEL = ../mips/max.c
DMAXKERNEL = ../mips/max.c

SMINKERNEL = ../mips/min.c
DMINKERNEL = ../mips/min.c

# --- index of max / min -----------------------------------------------------
ISAMAXKERNEL = ../mips/iamax.c
IDAMAXKERNEL = ../mips/iamax.c
ICAMAXKERNEL = ../mips/izamax.c
IZAMAXKERNEL = ../mips/izamax.c

ISAMINKERNEL = ../mips/iamin.c
IDAMINKERNEL = ../mips/iamin.c
ICAMINKERNEL = ../mips/izamin.c
IZAMINKERNEL = ../mips/izamin.c

ISMAXKERNEL = ../mips/imax.c
IDMAXKERNEL = ../mips/imax.c

ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c

# --- sum of absolute values -------------------------------------------------
ifdef HAVE_MSA
SASUMKERNEL = ../mips/sasum_msa.c
DASUMKERNEL = ../mips/dasum_msa.c
CASUMKERNEL = ../mips/casum_msa.c
ZASUMKERNEL = ../mips/zasum_msa.c
else
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
# The complex variants must not reuse the real-valued asum.c: like every
# other complex routine in this list they need the z* kernel that walks
# (real, imaginary) pairs.
# NOTE(review): confirm ../mips/zasum.c is present in the tree.
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
endif

# --- axpy / copy ------------------------------------------------------------
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c

SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c

# --- dot product ------------------------------------------------------------
ifdef HAVE_MSA
SDOTKERNEL = ../mips/sdot_msa.c
DDOTKERNEL = ../mips/ddot_msa.c
CDOTKERNEL = ../mips/cdot_msa.c
ZDOTKERNEL = ../mips/zdot_msa.c
else
SDOTKERNEL = ../mips/dot.c
DDOTKERNEL = ../mips/dot.c
CDOTKERNEL = ../mips/zdot.c
ZDOTKERNEL = ../mips/zdot.c
endif

# --- nrm2 / rot / scal / swap -----------------------------------------------
SNRM2KERNEL = ../mips/nrm2.c
DNRM2KERNEL = ../mips/nrm2.c
CNRM2KERNEL = ../mips/znrm2.c
ZNRM2KERNEL = ../mips/znrm2.c

SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c

SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c

SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c

# --- gemv (non-transposed) --------------------------------------------------
ifdef HAVE_MSA
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
DGEMVNKERNEL = ../mips/dgemv_n_msa.c
CGEMVNKERNEL = ../mips/cgemv_n_msa.c
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
else
SGEMVNKERNEL = ../mips/gemv_n.c
DGEMVNKERNEL = ../mips/gemv_n.c
CGEMVNKERNEL = ../mips/zgemv_n.c
ZGEMVNKERNEL = ../mips/zgemv_n.c
endif

# --- gemv (transposed) ------------------------------------------------------
ifdef HAVE_MSA
SGEMVTKERNEL = ../mips/sgemv_t_msa.c
DGEMVTKERNEL = ../mips/dgemv_t_msa.c
CGEMVTKERNEL = ../mips/cgemv_t_msa.c
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c
else
SGEMVTKERNEL = ../mips/gemv_t.c
DGEMVTKERNEL = ../mips/gemv_t.c
CGEMVTKERNEL = ../mips/zgemv_t.c
ZGEMVTKERNEL = ../mips/zgemv_t.c
endif

# --- sgemm ------------------------------------------------------------------
ifdef HAVE_MSA
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
else
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
endif

# --- dgemm (the MSA build additionally sets the inner-copy variables) --------
ifdef HAVE_MSA
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c
DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c
DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c
DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
else
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
endif

# --- cgemm ------------------------------------------------------------------
ifdef HAVE_MSA
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
else
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
endif

# --- zgemm ------------------------------------------------------------------
ifdef HAVE_MSA
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
else
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
endif

# --- strsm ------------------------------------------------------------------
ifdef HAVE_MSA
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
else
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif

# --- dtrsm ------------------------------------------------------------------
ifdef HAVE_MSA
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
else
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif

# --- ctrsm / ztrsm ----------------------------------------------------------
# The generic kernels are used whether or not HAVE_MSA is defined, so no
# conditional is needed here.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
# Double-colon `clean` rule with no prerequisites and no recipe: it declares
# the target without doing any work here.
# NOTE(review): presumably present so this directory takes part in a shared
# `clean ::` target defined elsewhere - confirm against the including Makefile.
clean ::
|
||||
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Largest absolute value among the n entries x[0], x[inc_x], x[2*inc_x], ...
 *
 * n      number of entries to inspect
 * x      input vector
 * inc_x  distance (in FLOATs) between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos = inc_x;   /* offset of the next entry to inspect */
    FLOAT best = 0.0;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) return(best);

    /* Seed the running maximum with the first entry. */
    best = ABS(x[0]);

    for (k = 1; k < n; k++)
    {
        mag = ABS(x[pos]);
        if (mag > best) best = mag;
        pos += inc_x;
    }

    return(best);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Smallest absolute value among the n entries x[0], x[inc_x], x[2*inc_x], ...
 *
 * n      number of entries to inspect
 * x      input vector
 * inc_x  distance (in FLOATs) between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos = inc_x;   /* offset of the next entry to inspect */
    FLOAT best = 0.0;
    FLOAT mag;

    if (n <= 0 || inc_x <= 0) return(best);

    /* Seed the running minimum with the first entry. */
    best = ABS(x[0]);

    for (k = 1; k < n; k++)
    {
        mag = ABS(x[pos]);
        if (mag < best) best = mag;
        pos += inc_x;
    }

    return(best);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Sum of the absolute values of the n entries x[0], x[inc_x], x[2*inc_x], ...
 *
 * n      number of entries to add up
 * x      input vector
 * inc_x  distance (in FLOATs) between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG pos;
    BLASLONG limit;
    FLOAT total = 0.0;

    if (n <= 0 || inc_x <= 0) return(total);

    /* One past the offset of the last entry that is visited. */
    limit = n * inc_x;

    for (pos = 0; pos < limit; pos += inc_x)
    {
        total += ABS(x[pos]);
    }

    return(total);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * y := alpha * x + beta * y over n strided entries.
 *
 * n             number of entries to process (nothing is done when n < 0)
 * alpha, beta   scalar factors
 * x, inc_x      input vector and its stride (in FLOATs)
 * y, inc_y      in/out vector and its stride (in FLOATs)
 *
 * A zero factor is handled without multiplying: with beta == 0 the old
 * contents of y are never read, and with alpha == 0 x is never read.
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG px = 0;
    BLASLONG py = 0;

    if (n < 0) return(0);

    if (beta == 0.0)
    {
        if (alpha == 0.0)
        {
            /* Both factors vanish: plain zero fill. */
            for (k = 0; k < n; k++)
            {
                y[py] = 0.0;
                py += inc_y;
            }
        }
        else
        {
            /* y is overwritten with the scaled x. */
            for (k = 0; k < n; k++)
            {
                y[py] = alpha * x[px];
                px += inc_x;
                py += inc_y;
            }
        }

        return(0);
    }

    if (alpha == 0.0)
    {
        /* Only the in-place scaling of y remains. */
        for (k = 0; k < n; k++)
        {
            y[py] = beta * y[py];
            py += inc_y;
        }
    }
    else
    {
        /* General case. */
        for (k = 0; k < n; k++)
        {
            y[py] = alpha * x[px] + beta * y[py];
            px += inc_x;
            py += inc_y;
        }
    }

    return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * y := y + da * x over n strided entries.
 *
 * n          number of entries to process
 * da         scalar factor
 * x, inc_x   input vector and its stride (in FLOATs)
 * y, inc_y   in/out vector and its stride (in FLOATs)
 * dummy0, dummy1, dummy, dummy2   not used by this function
 *
 * Nothing is done when n < 0 or da == 0. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG px = 0;
    BLASLONG py = 0;

    if (n < 0 || da == 0.0) return(0);

    for (k = 0; k < n; k++)
    {
        y[py] += da * x[px];
        px += inc_x;
        py += inc_y;
    }

    return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,338 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include "macros_msa.h"
|
||||
|
||||
#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))
|
||||
|
||||
/*
 * MSA implementation of the sum of absolute values where every one of the
 * n elements occupies two consecutive floats (presumably the real and
 * imaginary part of a single-precision complex number).
 *
 * n      number of two-float elements
 * x      input vector
 * inc_x  distance between consecutive elements, counted in elements
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Absolute values are formed by masking off bit 31 (the sign bit) of each
 * lane with and_vec. Four independent vector accumulators are used; the
 * k-th vector of every batch always goes to accumulator k % 4, and the
 * accumulators are reduced exactly once after all vector loads.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i, inc_x2;
    FLOAT sumf = 0.0;
    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
    v4f32 zero_v = {0};
    v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};

    if (n <= 0 || inc_x <= 0) return (sumf);

    if (1 == inc_x)
    {
        /* Contiguous data: one vector covers two elements, so a batch of
           eight vectors consumes 16 elements. */
        if (n > 15)
        {
            /* The first batch initialises the accumulators directly. */
            n -= 16;

            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_W(src0);
            sum_abs1 = AND_VEC_W(src1);
            sum_abs2 = AND_VEC_W(src2);
            sum_abs3 = AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        for (i = (n >> 4); i--;)
        {
            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_W(src0);
            sum_abs1 += AND_VEC_W(src1);
            sum_abs2 += AND_VEC_W(src2);
            sum_abs3 += AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }

        /* Remaining whole vectors: bits 8, 4 and 2 of n stand for 4, 2 and
           1 vectors respectively. */
        switch (n & 14)
        {
            case 14:
                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                sum_abs2 += AND_VEC_W(src6);
                break;

            case 12:
                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                break;

            case 10:
                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                break;

            case 6:
                LD_SP3_INC(x, 4, src0, src1, src2);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                break;

            case 8:
                LD_SP4_INC(x, 4, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                break;

            case 4:
                LD_SP2_INC(x, 4, src0, src1);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                break;

            case 2:
                src0 = LD_SP(x); x += 4;

                sum_abs0 += AND_VEC_W(src0);
                break;

            default:
                break;
        }

        /* Single horizontal reduction of the four accumulators. */
        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        sumf = sum_abs0[0];
        sumf += sum_abs0[1];
        sumf += sum_abs0[2];
        sumf += sum_abs0[3];

        /* An odd n leaves one element (two floats) for scalar code. */
        if (n & 1)
        {
            sumf += fabsf(*(x + 0));
            sumf += fabsf(*(x + 1));
        }
    }
    else
    {
        /* Strided data: every vector load starts at one element and the
           pointer advances by inc_x2 floats, so a batch of eight vectors
           consumes 8 elements. Only lanes 0 and 1 enter the final sum.
           NOTE(review): each load presumably still fetches a full vector,
           i.e. two floats beyond the element itself - confirm that this
           over-read is acceptable for the last element. */
        inc_x2 = 2 * inc_x;

        if (n > 8)
        {
            /* The first batch initialises the accumulators directly. */
            n -= 8;

            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_W(src0);
            sum_abs1 = AND_VEC_W(src1);
            sum_abs2 = AND_VEC_W(src2);
            sum_abs3 = AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        for (i = (n >> 3); i--;)
        {
            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_W(src0);
            sum_abs1 += AND_VEC_W(src1);
            sum_abs2 += AND_VEC_W(src2);
            sum_abs3 += AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }

        /* Remaining elements: n & 7 is the number of vectors still to load. */
        switch (n & 7)
        {
            case 7:
                LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                sum_abs2 += AND_VEC_W(src6);
                break;

            case 6:
                LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                break;

            case 5:
                LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                break;

            case 3:
                LD_SP3_INC(x, inc_x2, src0, src1, src2);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                break;

            case 4:
                LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                break;

            case 2:
                LD_SP2_INC(x, inc_x2, src0, src1);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                break;

            case 1:
                src0 = LD_SP(x); x += inc_x2;

                sum_abs0 += AND_VEC_W(src0);
                break;

            default:
                break;
        }

        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        /* Lanes 2 and 3 are not part of the requested elements. */
        sumf = sum_abs0[0] + sum_abs0[1];
    }

    return (sumf);
}
|
||||
|
|
@ -0,0 +1,361 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Operators used by the scalar code paths.
 *   CONJ undefined (cdotu): real += xr*yr - xi*yi,   imag += xi*yr + xr*yi
 *   CONJ defined   (cdotc): real += xr*yr + xi*yi,   imag -= xi*yr - xr*yi
 */
#if defined(CONJ)
#define OP2 -=
#define OP3 +
#define OP4 -
#else
#define OP2 +=
#define OP3 -
#define OP4 +
#endif

/* One vector multiply-accumulate step for register set k (vx<k>r/vx<k>i and
 * vy<k>r/vy<k>i): dot0 gathers the real products, dot1 the imaginary ones.
 * OPR0 and OPR1 are bare '+' or '-' tokens that are pasted onto '=' to form
 * the compound assignment, which is how the caller selects the conjugated or
 * unconjugated variant.
 */
#define CDOT_MSA_STEP(k, OPR0, OPR1)      \
    dot0 += (vx##k##r * vy##k##r);        \
    dot0 OPR0## = (vx##k##i * vy##k##i);  \
    dot1 OPR1## = (vx##k##i * vy##k##r);  \
    dot1 += (vx##k##r * vy##k##i);

/* Kernels for one, two, three and four register sets; each wider kernel is
 * the next narrower one followed by one more step, so the statement order
 * is set 0, 1, 2, 3.
 */
#define DOT4_KERNEL(OPR0, OPR1)   \
    CDOT_MSA_STEP(0, OPR0, OPR1)

#define DOT8_KERNEL(OPR0, OPR1)   \
    DOT4_KERNEL(OPR0, OPR1)       \
    CDOT_MSA_STEP(1, OPR0, OPR1)

#define DOT12_KERNEL(OPR0, OPR1)  \
    DOT8_KERNEL(OPR0, OPR1)       \
    CDOT_MSA_STEP(2, OPR0, OPR1)

#define DOT16_KERNEL(OPR0, OPR1)  \
    DOT12_KERNEL(OPR0, OPR1)      \
    CDOT_MSA_STEP(3, OPR0, OPR1)
|
||||
|
||||
/* return float, x,y float */
|
||||
/* cdotc - CONJ */
|
||||
/* cdotu - !CONJ */
|
||||
#ifndef _MSC_VER
|
||||
#include <complex.h>
|
||||
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
FLOAT dot[2];
|
||||
BLASLONG inc_x2;
|
||||
BLASLONG inc_y2;
|
||||
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
|
||||
v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
|
||||
v4f32 dot0 = {0, 0, 0, 0};
|
||||
v4f32 dot1 = {0, 0, 0, 0};
|
||||
openblas_complex_float result;
|
||||
|
||||
dot[0] = 0.0;
|
||||
dot[1] = 0.0;
|
||||
|
||||
__real__(result) = 0.0;
|
||||
__imag__(result) = 0.0;
|
||||
|
||||
if ( n < 1 ) return(result);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
|
||||
LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
|
||||
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
|
||||
PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
|
||||
PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
|
||||
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
|
||||
PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
|
||||
PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT16_KERNEL(-, +);
|
||||
#else
|
||||
DOT16_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
|
||||
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
|
||||
LD_SP2_INC(x, 4, vx4, vx5);
|
||||
LD_SP2_INC(y, 4, vy4, vy5);
|
||||
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
|
||||
PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
|
||||
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
|
||||
PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT12_KERNEL(-, +);
|
||||
#else
|
||||
DOT12_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
|
||||
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
|
||||
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
|
||||
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT8_KERNEL(-, +);
|
||||
#else
|
||||
DOT8_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_SP2_INC(x, 4, vx0, vx1);
|
||||
LD_SP2_INC(y, 4, vy0, vy1);
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT4_KERNEL(-, +);
|
||||
#else
|
||||
DOT4_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
|
||||
LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
|
||||
dot[0] += ( x4 * y4 OP3 x5 * y5 );
|
||||
dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP4_INC(x, 1, x0, x1, x2, x3);
|
||||
LD_GP4_INC(y, 1, y0, y1, y2, y3);
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, x0, x1);
|
||||
LD_GP2_INC(y, 1, y0, y1);
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
}
|
||||
}
|
||||
|
||||
dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
|
||||
dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x2 = *x;
|
||||
x3 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x4 = *x;
|
||||
x5 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x6 = *x;
|
||||
x7 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y2 = *y;
|
||||
y3 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y4 = *y;
|
||||
y5 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y6 = *y;
|
||||
y7 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
|
||||
dot[0] += ( x4 * y4 OP3 x5 * y5 );
|
||||
dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
|
||||
|
||||
dot[0] += ( x6 * y6 OP3 x7 * y7 );
|
||||
dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x2 = *x;
|
||||
x3 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x4 = *x;
|
||||
x5 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y2 = *y;
|
||||
y3 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y4 = *y;
|
||||
y5 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
|
||||
dot[0] += ( x4 * y4 OP3 x5 * y5 );
|
||||
dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x2 = *x;
|
||||
x3 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y2 = *y;
|
||||
y3 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
}
|
||||
}
|
||||
|
||||
__real__(result) = dot[0];
|
||||
__imag__(result) = dot[1];
|
||||
|
||||
return(result);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,195 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Interleaving copy of single-precision complex data (MSA), four wide.
 *
 * src is walked as n vectors of m complex elements each, consecutive vectors
 * lying lda complex elements apart.  Vectors are taken four at a time, then
 * two, then one for what is left of n.  Inside a group, element k of every
 * vector is written to dst back to back before element k + 1 follows.
 * Always returns 0.
 *
 * NOTE(review): LD_SP*, ST_SP* and ILVRL_D2_SP come from macros_msa.h; the
 * comments below assume the *_INC forms advance their pointer and that
 * ILVRL_D2_SP pairs up the low and the high 64-bit halves of its two inputs.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG rows, groups;
    BLASLONG col_step = 2 * lda;    /* vector spacing in FLOATs (re + im) */
    FLOAT *base = src;
    FLOAT *pack = dst;
    FLOAT *c0, *c1, *c2, *c3;
    FLOAT re0, im0, re1, im1, re2, im2, re3, im3;
    v4f32 v0a, v0b, v1a, v1b, v2a, v2b, v3a, v3b;
    v4f32 lo0, lo1, hi0, hi1;

    /* Groups of four source vectors. */
    groups = n >> 2;
    while (groups--)
    {
        c0 = base;
        c1 = c0 + col_step;
        c2 = c1 + col_step;
        c3 = c2 + col_step;
        base += 4 * col_step;

        /* Four complex elements of each vector per pass. */
        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            LD_SP2_INC(c1, 4, v1a, v1b);
            LD_SP2_INC(c2, 4, v2a, v2b);
            LD_SP2_INC(c3, 4, v3a, v3b);

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ILVRL_D2_SP(v3a, v2a, lo1, hi1);
            ST_SP4_INC(lo0, lo1, hi0, hi1, pack, 4);

            ILVRL_D2_SP(v1b, v0b, lo0, hi0);
            ILVRL_D2_SP(v3b, v2b, lo1, hi1);
            ST_SP4_INC(lo0, lo1, hi0, hi1, pack, 4);
        }

        /* Two leftover elements per vector. */
        if (m & 2)
        {
            v0a = LD_SP(c0);  c0 += 4;
            v1a = LD_SP(c1);  c1 += 4;
            v2a = LD_SP(c2);  c2 += 4;
            v3a = LD_SP(c3);  c3 += 4;

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ILVRL_D2_SP(v3a, v2a, lo1, hi1);
            ST_SP4_INC(lo0, lo1, hi0, hi1, pack, 4);
        }

        /* One leftover element per vector, copied with scalar moves. */
        if (m & 1)
        {
            re0 = c0[0];  im0 = c0[1];
            re1 = c1[0];  im1 = c1[1];
            re2 = c2[0];  im2 = c2[1];
            re3 = c3[0];  im3 = c3[1];

            pack[0] = re0;  pack[1] = im0;
            pack[2] = re1;  pack[3] = im1;
            pack[4] = re2;  pack[5] = im2;
            pack[6] = re3;  pack[7] = im3;
            pack += 8;
        }
    }

    /* A remaining pair of source vectors. */
    if (n & 2)
    {
        c0 = base;
        c1 = c0 + col_step;
        base += 2 * col_step;

        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            LD_SP2_INC(c1, 4, v1a, v1b);

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ST_SP2_INC(lo0, hi0, pack, 4);

            ILVRL_D2_SP(v1b, v0b, lo0, hi0);
            ST_SP2_INC(lo0, hi0, pack, 4);
        }

        if (m & 2)
        {
            v0a = LD_SP(c0);  c0 += 4;
            v1a = LD_SP(c1);  c1 += 4;

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ST_SP2_INC(lo0, hi0, pack, 4);
        }

        if (m & 1)
        {
            re0 = c0[0];  im0 = c0[1];
            re1 = c1[0];  im1 = c1[1];

            pack[0] = re0;  pack[1] = im0;
            pack[2] = re1;  pack[3] = im1;
            pack += 4;
        }
    }

    /* A single remaining source vector: straight copy. */
    if (n & 1)
    {
        c0 = base;

        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            ST_SP2_INC(v0a, v0b, pack, 4);
        }

        if (m & 2)
        {
            v0a = LD_SP(c0);
            c0 += 4;

            ST_SP(v0a, pack);
            pack += 4;
        }

        if (m & 1)
        {
            re0 = c0[0];
            im0 = c0[1];

            pack[0] = re0;
            pack[1] = im0;
            pack += 2;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,310 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Interleaving copy of single-precision complex data (MSA), eight wide.
 *
 * src is walked as n vectors of m complex elements each, consecutive vectors
 * lying lda complex elements apart.  Vectors are taken eight at a time, then
 * four, two and one for what is left of n.  Inside a group, element k of
 * every vector is written to dst back to back before element k + 1 follows.
 * Always returns 0.
 *
 * NOTE(review): LD_SP*, ST_SP* and ILVRL_D2_SP come from macros_msa.h; the
 * comments below assume the *_INC forms advance their pointer and that
 * ILVRL_D2_SP pairs up the low and the high 64-bit halves of its two inputs.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG rows, groups;
    BLASLONG col_step = 2 * lda;    /* vector spacing in FLOATs (re + im) */
    FLOAT *base = src;
    FLOAT *pack = dst;
    FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
    FLOAT re0, im0, re1, im1, re2, im2, re3, im3;
    FLOAT re4, im4, re5, im5, re6, im6, re7, im7;
    v4f32 v0a, v0b, v1a, v1b, v2a, v2b, v3a, v3b;
    v4f32 v4a, v4b, v5a, v5b, v6a, v6b, v7a, v7b;
    v4f32 lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3;

    /* Groups of eight source vectors. */
    groups = n >> 3;
    while (groups--)
    {
        c0 = base;
        c1 = c0 + col_step;
        c2 = c1 + col_step;
        c3 = c2 + col_step;
        c4 = c3 + col_step;
        c5 = c4 + col_step;
        c6 = c5 + col_step;
        c7 = c6 + col_step;
        base += 8 * col_step;

        /* Four complex elements of each vector per pass. */
        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            LD_SP2_INC(c1, 4, v1a, v1b);
            LD_SP2_INC(c2, 4, v2a, v2b);
            LD_SP2_INC(c3, 4, v3a, v3b);
            LD_SP2_INC(c4, 4, v4a, v4b);
            LD_SP2_INC(c5, 4, v5a, v5b);
            LD_SP2_INC(c6, 4, v6a, v6b);
            LD_SP2_INC(c7, 4, v7a, v7b);

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ILVRL_D2_SP(v3a, v2a, lo1, hi1);
            ILVRL_D2_SP(v5a, v4a, lo2, hi2);
            ILVRL_D2_SP(v7a, v6a, lo3, hi3);
            ST_SP8_INC(lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3, pack, 4);

            ILVRL_D2_SP(v1b, v0b, lo0, hi0);
            ILVRL_D2_SP(v3b, v2b, lo1, hi1);
            ILVRL_D2_SP(v5b, v4b, lo2, hi2);
            ILVRL_D2_SP(v7b, v6b, lo3, hi3);
            ST_SP8_INC(lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3, pack, 4);
        }

        /* Two leftover elements per vector. */
        if (m & 2)
        {
            v0a = LD_SP(c0);  c0 += 4;
            v1a = LD_SP(c1);  c1 += 4;
            v2a = LD_SP(c2);  c2 += 4;
            v3a = LD_SP(c3);  c3 += 4;
            v4a = LD_SP(c4);  c4 += 4;
            v5a = LD_SP(c5);  c5 += 4;
            v6a = LD_SP(c6);  c6 += 4;
            v7a = LD_SP(c7);  c7 += 4;

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ILVRL_D2_SP(v3a, v2a, lo1, hi1);
            ILVRL_D2_SP(v5a, v4a, lo2, hi2);
            ILVRL_D2_SP(v7a, v6a, lo3, hi3);
            ST_SP8_INC(lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3, pack, 4);
        }

        /* One leftover element per vector, copied with scalar moves. */
        if (m & 1)
        {
            re0 = c0[0];  im0 = c0[1];
            re1 = c1[0];  im1 = c1[1];
            re2 = c2[0];  im2 = c2[1];
            re3 = c3[0];  im3 = c3[1];
            re4 = c4[0];  im4 = c4[1];
            re5 = c5[0];  im5 = c5[1];
            re6 = c6[0];  im6 = c6[1];
            re7 = c7[0];  im7 = c7[1];

            pack[0] = re0;   pack[1] = im0;
            pack[2] = re1;   pack[3] = im1;
            pack[4] = re2;   pack[5] = im2;
            pack[6] = re3;   pack[7] = im3;
            pack[8] = re4;   pack[9] = im4;
            pack[10] = re5;  pack[11] = im5;
            pack[12] = re6;  pack[13] = im6;
            pack[14] = re7;  pack[15] = im7;
            pack += 16;
        }
    }

    /* A remaining group of four source vectors. */
    if (n & 4)
    {
        c0 = base;
        c1 = c0 + col_step;
        c2 = c1 + col_step;
        c3 = c2 + col_step;
        base += 4 * col_step;

        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            LD_SP2_INC(c1, 4, v1a, v1b);
            LD_SP2_INC(c2, 4, v2a, v2b);
            LD_SP2_INC(c3, 4, v3a, v3b);

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ILVRL_D2_SP(v3a, v2a, lo1, hi1);
            ST_SP4_INC(lo0, lo1, hi0, hi1, pack, 4);

            ILVRL_D2_SP(v1b, v0b, lo0, hi0);
            ILVRL_D2_SP(v3b, v2b, lo1, hi1);
            ST_SP4_INC(lo0, lo1, hi0, hi1, pack, 4);
        }

        if (m & 2)
        {
            v0a = LD_SP(c0);  c0 += 4;
            v1a = LD_SP(c1);  c1 += 4;
            v2a = LD_SP(c2);  c2 += 4;
            v3a = LD_SP(c3);  c3 += 4;

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ILVRL_D2_SP(v3a, v2a, lo1, hi1);
            ST_SP4_INC(lo0, lo1, hi0, hi1, pack, 4);
        }

        if (m & 1)
        {
            re0 = c0[0];  im0 = c0[1];
            re1 = c1[0];  im1 = c1[1];
            re2 = c2[0];  im2 = c2[1];
            re3 = c3[0];  im3 = c3[1];

            pack[0] = re0;  pack[1] = im0;
            pack[2] = re1;  pack[3] = im1;
            pack[4] = re2;  pack[5] = im2;
            pack[6] = re3;  pack[7] = im3;
            pack += 8;
        }
    }

    /* A remaining pair of source vectors. */
    if (n & 2)
    {
        c0 = base;
        c1 = c0 + col_step;
        base += 2 * col_step;

        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            LD_SP2_INC(c1, 4, v1a, v1b);

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ST_SP2_INC(lo0, hi0, pack, 4);

            ILVRL_D2_SP(v1b, v0b, lo0, hi0);
            ST_SP2_INC(lo0, hi0, pack, 4);
        }

        if (m & 2)
        {
            v0a = LD_SP(c0);  c0 += 4;
            v1a = LD_SP(c1);  c1 += 4;

            ILVRL_D2_SP(v1a, v0a, lo0, hi0);
            ST_SP2_INC(lo0, hi0, pack, 4);
        }

        if (m & 1)
        {
            re0 = c0[0];  im0 = c0[1];
            re1 = c1[0];  im1 = c1[1];

            pack[0] = re0;  pack[1] = im0;
            pack[2] = re1;  pack[3] = im1;
            pack += 4;
        }
    }

    /* A single remaining source vector: straight copy. */
    if (n & 1)
    {
        c0 = base;

        rows = m >> 2;
        while (rows--)
        {
            LD_SP2_INC(c0, 4, v0a, v0b);
            ST_SP2_INC(v0a, v0b, pack, 4);
        }

        if (m & 2)
        {
            v0a = LD_SP(c0);
            c0 += 4;

            ST_SP(v0a, pack);
            pack += 4;
        }

        if (m & 1)
        {
            re0 = c0[0];
            im0 = c0[1];

            pack[0] = re0;
            pack[1] = im0;
            pack += 2;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Strip copy of single-precision complex data (MSA), four wide.
 *
 * src is walked as m lines lying lda complex elements apart.  The n complex
 * elements of a line are split into strips of four, then two, then one
 * contiguous element(s).  For every strip, that strip of line 0, line 1, ...
 * line m - 1 is appended to dst; lines are handled two per pass with a
 * single-line tail when m is odd.  Always returns 0.
 *
 * NOTE(review): LD_SP* / ST_SP* come from macros_msa.h; only the *_INC
 * stores are assumed to advance their pointer.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG pairs, strips;
    BLASLONG line_step = 2 * lda;       /* line spacing in FLOATs (re + im) */
    BLASLONG pair_step = 2 * line_step; /* spacing of two lines */
    FLOAT *strip = src;
    FLOAT *pack = dst;
    FLOAT *ra, *rb;
    FLOAT re_a, im_a, re_b, im_b;
    v4f32 a0, a1, b0, b1;

    /* Strips of four complex elements (two vectors per line). */
    strips = n >> 2;
    while (strips--)
    {
        ra = strip;
        rb = strip + line_step;
        strip += 8;

        pairs = m >> 1;
        while (pairs--)
        {
            LD_SP2(ra, 4, a0, a1);
            LD_SP2(rb, 4, b0, b1);
            ST_SP4_INC(a0, a1, b0, b1, pack, 4);

            ra += pair_step;
            rb += pair_step;
        }

        if (m & 1)
        {
            LD_SP2(ra, 4, a0, a1);
            ST_SP2_INC(a0, a1, pack, 4);
        }
    }

    /* A strip of two complex elements (one vector per line). */
    if (n & 2)
    {
        ra = strip;
        rb = strip + line_step;
        strip += 4;

        pairs = m >> 1;
        while (pairs--)
        {
            a0 = LD_SP(ra);
            b0 = LD_SP(rb);
            ST_SP2_INC(a0, b0, pack, 4);

            ra += pair_step;
            rb += pair_step;
        }

        if (m & 1)
        {
            a0 = LD_SP(ra);
            ST_SP(a0, pack);
            pack += 4;
        }
    }

    /* A strip of one complex element: scalar moves. */
    if (n & 1)
    {
        ra = strip;
        rb = strip + line_step;

        pairs = m >> 1;
        while (pairs--)
        {
            re_a = ra[0];
            im_a = ra[1];
            re_b = rb[0];
            im_b = rb[1];

            pack[0] = re_a;
            pack[1] = im_a;
            pack[2] = re_b;
            pack[3] = im_b;

            ra += pair_step;
            rb += pair_step;
            pack += 4;
        }

        if (m & 1)
        {
            re_a = ra[0];
            im_a = ra[1];

            pack[0] = re_a;
            pack[1] = im_a;
            pack += 2;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,214 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Strip copy of single-precision complex data (MSA), eight wide.
 *
 * src is walked as m lines lying lda complex elements apart.  The n complex
 * elements of a line are split into strips of eight, then four, two and one
 * contiguous element(s).  For every strip, that strip of line 0, line 1, ...
 * line m - 1 is appended to dst; lines are handled four per pass, followed
 * by a two-line and a one-line tail.  Always returns 0.
 *
 * NOTE(review): LD_SP* / ST_SP* come from macros_msa.h; only the *_INC
 * stores are assumed to advance their pointer.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG quads, strips, pass;
    BLASLONG line_step = 2 * lda;       /* line spacing in FLOATs (re + im) */
    BLASLONG pair_step = 2 * line_step; /* spacing of two lines */
    FLOAT *strip = src;
    FLOAT *pack = dst;
    FLOAT *ra, *rb;
    FLOAT re_a, im_a, re_b, im_b;
    v4f32 a0, a1, a2, a3, b0, b1, b2, b3;
    v4f32 c0, c1, c2, c3, d0, d1, d2, d3;

    /* Strips of eight complex elements (four vectors per line). */
    strips = n >> 3;
    while (strips--)
    {
        ra = strip;
        rb = strip + line_step;
        strip += 16;

        quads = m >> 2;
        while (quads--)
        {
            LD_SP4(ra, 4, a0, a1, a2, a3);
            LD_SP4(rb, 4, b0, b1, b2, b3);
            LD_SP4(ra + pair_step, 4, c0, c1, c2, c3);
            LD_SP4(rb + pair_step, 4, d0, d1, d2, d3);

            ST_SP8_INC(a0, a1, a2, a3, b0, b1, b2, b3, pack, 4);
            ST_SP8_INC(c0, c1, c2, c3, d0, d1, d2, d3, pack, 4);

            ra += 2 * pair_step;
            rb += 2 * pair_step;
        }

        if (m & 2)
        {
            LD_SP4(ra, 4, a0, a1, a2, a3);
            LD_SP4(rb, 4, b0, b1, b2, b3);
            ST_SP8_INC(a0, a1, a2, a3, b0, b1, b2, b3, pack, 4);

            ra += pair_step;
            rb += pair_step;
        }

        if (m & 1)
        {
            LD_SP4(ra, 4, a0, a1, a2, a3);
            ST_SP4_INC(a0, a1, a2, a3, pack, 4);
        }
    }

    /* A strip of four complex elements (two vectors per line). */
    if (n & 4)
    {
        ra = strip;
        rb = strip + line_step;
        strip += 8;

        quads = m >> 2;
        while (quads--)
        {
            LD_SP2(ra, 4, a0, a1);
            LD_SP2(rb, 4, b0, b1);
            LD_SP2(ra + pair_step, 4, c0, c1);
            LD_SP2(rb + pair_step, 4, d0, d1);

            ST_SP4_INC(a0, a1, b0, b1, pack, 4);
            ST_SP4_INC(c0, c1, d0, d1, pack, 4);

            ra += 2 * pair_step;
            rb += 2 * pair_step;
        }

        if (m & 2)
        {
            LD_SP2(ra, 4, a0, a1);
            LD_SP2(rb, 4, b0, b1);
            ST_SP4_INC(a0, a1, b0, b1, pack, 4);

            ra += pair_step;
            rb += pair_step;
        }

        if (m & 1)
        {
            LD_SP2(ra, 4, a0, a1);
            ST_SP2_INC(a0, a1, pack, 4);
        }
    }

    /* A strip of two complex elements (one vector per line). */
    if (n & 2)
    {
        ra = strip;
        rb = strip + line_step;
        strip += 4;

        quads = m >> 2;
        while (quads--)
        {
            a0 = LD_SP(ra);
            b0 = LD_SP(rb);
            c0 = LD_SP(ra + pair_step);
            d0 = LD_SP(rb + pair_step);
            ST_SP4_INC(a0, b0, c0, d0, pack, 4);

            ra += 2 * pair_step;
            rb += 2 * pair_step;
        }

        if (m & 2)
        {
            a0 = LD_SP(ra);
            b0 = LD_SP(rb);
            ST_SP2_INC(a0, b0, pack, 4);

            ra += pair_step;
            rb += pair_step;
        }

        if (m & 1)
        {
            a0 = LD_SP(ra);
            ST_SP(a0, pack);
            pack += 4;
        }
    }

    /* A strip of one complex element: scalar moves. */
    if (n & 1)
    {
        ra = strip;
        rb = strip + line_step;

        /* Four lines per outer pass, moved as two identical two-line steps. */
        quads = m >> 2;
        while (quads--)
        {
            for (pass = 0; pass < 2; pass++)
            {
                re_a = ra[0];
                im_a = ra[1];
                re_b = rb[0];
                im_b = rb[1];

                pack[0] = re_a;
                pack[1] = im_a;
                pack[2] = re_b;
                pack[3] = im_b;

                ra += pair_step;
                rb += pair_step;
                pack += 4;
            }
        }

        if (m & 2)
        {
            re_a = ra[0];
            im_a = ra[1];
            re_b = rb[0];
            im_b = rb[1];

            pack[0] = re_a;
            pack[1] = im_a;
            pack[2] = re_b;
            pack[3] = im_b;

            ra += pair_step;
            rb += pair_step;
            pack += 4;
        }

        if (m & 1)
        {
            re_a = ra[0];
            im_a = ra[1];

            pack[0] = re_a;
            pack[1] = im_a;
            pack += 2;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,611 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Sign operators for the complex multiply-accumulate steps in this file.
 *
 *   OP3 / OP4 : used when x is scaled by alpha
 *               (real: alphar * xr OP3 alphai * xi,
 *                imag: alphar * xi OP4 alphai * xr).
 *   OP0       : applied to the (imag * imag) term folded into a real
 *               accumulator.
 *   OP1 / OP2 : applied to the two cross terms folded into an imaginary
 *               accumulator.
 *
 * Which signs are chosen depends on whether CONJ and/or XCONJ are defined
 * when this file is compiled.
 */
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

#ifdef XCONJ
    #define OP3  +=
    #define OP4  -=
#else
    #define OP3  -=
    #define OP4  +=
#endif

#if defined(CONJ) && defined(XCONJ)
    #define OP0  -=
    #define OP1  -=
    #define OP2  +=
#elif defined(CONJ)
    #define OP0  +=
    #define OP1  -=
    #define OP2  -=
#elif defined(XCONJ)
    #define OP0  +=
    #define OP1  +=
    #define OP2  -=
#else
    #define OP0  -=
    #define OP1  +=
    #define OP2  +=
#endif
|
||||
|
||||
/*
 * Accumulate the contribution of four columns of A (pa0..pa3, at float
 * offset k) into the y accumulators y0r/y0i and y1r/y1i.
 *
 * tpNr / tpNi hold the real / imaginary part of the scaled x entry for
 * column N.  LD_SP4 and PCKEVOD_W2_SP come from macros_msa.h; presumably
 * they load four consecutive vectors and split interleaved (re, im) pairs
 * into separate real and imaginary vectors -- confirm there.
 *
 * The updates are grouped per accumulator; within each accumulator the
 * terms are applied in the same order as before.
 */
#define CGEMV_N_8x4()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);  \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);  \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);  \
    \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);  \
    \
    /* real part, first vector */  \
    y0r += tp0r * src0r;  \
    y0r += tp1r * src2r;  \
    y0r += tp2r * src4r;  \
    y0r += tp3r * src6r;  \
    y0r OP0 tp0i * src0i;  \
    y0r OP0 tp1i * src2i;  \
    y0r OP0 tp2i * src4i;  \
    y0r OP0 tp3i * src6i;  \
    \
    /* real part, second vector */  \
    y1r += tp0r * src1r;  \
    y1r += tp1r * src3r;  \
    y1r += tp2r * src5r;  \
    y1r += tp3r * src7r;  \
    y1r OP0 tp0i * src1i;  \
    y1r OP0 tp1i * src3i;  \
    y1r OP0 tp2i * src5i;  \
    y1r OP0 tp3i * src7i;  \
    \
    /* imaginary part, first vector */  \
    y0i OP1 tp0r * src0i;  \
    y0i OP1 tp1r * src2i;  \
    y0i OP1 tp2r * src4i;  \
    y0i OP1 tp3r * src6i;  \
    y0i OP2 tp0i * src0r;  \
    y0i OP2 tp1i * src2r;  \
    y0i OP2 tp2i * src4r;  \
    y0i OP2 tp3i * src6r;  \
    \
    /* imaginary part, second vector */  \
    y1i OP1 tp0r * src1i;  \
    y1i OP1 tp1r * src3i;  \
    y1i OP1 tp2r * src5i;  \
    y1i OP1 tp3r * src7i;  \
    y1i OP2 tp0i * src1r;  \
    y1i OP2 tp1i * src3r;  \
    y1i OP2 tp2i * src5r;  \
    y1i OP2 tp3i * src7r;
|
||||
|
||||
/*
 * Accumulate the contribution of four columns of A (pa0..pa3, at float
 * offset k) into the single accumulator pair y0r / y0i.
 *
 * LD_SP2 and PCKEVOD_W2_SP are defined in macros_msa.h; presumably they
 * load two consecutive vectors and de-interleave (re, im) pairs -- confirm.
 */
#define CGEMV_N_4x4()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    LD_SP2(pa2 + k, 4, t8, t9);  \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);  \
    LD_SP2(pa3 + k, 4, t12, t13);  \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
    \
    /* real accumulator */  \
    y0r += tp0r * src0r;  \
    y0r += tp1r * src2r;  \
    y0r += tp2r * src4r;  \
    y0r += tp3r * src6r;  \
    y0r OP0 tp0i * src0i;  \
    y0r OP0 tp1i * src2i;  \
    y0r OP0 tp2i * src4i;  \
    y0r OP0 tp3i * src6i;  \
    \
    /* imaginary accumulator */  \
    y0i OP1 tp0r * src0i;  \
    y0i OP1 tp1r * src2i;  \
    y0i OP1 tp2r * src4i;  \
    y0i OP1 tp3r * src6i;  \
    y0i OP2 tp0i * src0r;  \
    y0i OP2 tp1i * src2r;  \
    y0i OP2 tp2i * src4r;  \
    y0i OP2 tp3i * src6r;
|
||||
|
||||
/*
 * Scalar tail: update one complex element of y (y[0] real, y[1] imaginary)
 * with the contribution of four columns.  tempN_r / tempN_i are the real /
 * imaginary parts of the scaled x entry for column N; paN[k] / paN[k + 1]
 * are the real / imaginary parts of the matrix element.
 */
#define CGEMV_N_1x4()  \
    res0 = y[0];  \
    res1 = y[1];  \
    \
    /* column 0 */  \
    res0 += temp0_r * pa0[k];  \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];  \
    \
    /* column 1 */  \
    res0 += temp1_r * pa1[k];  \
    res0 OP0 temp1_i * pa1[k + 1];  \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];  \
    \
    /* column 2 */  \
    res0 += temp2_r * pa2[k];  \
    res0 OP0 temp2_i * pa2[k + 1];  \
    res1 OP1 temp2_r * pa2[k + 1];  \
    res1 OP2 temp2_i * pa2[k];  \
    \
    /* column 3 */  \
    res0 += temp3_r * pa3[k];  \
    res0 OP0 temp3_i * pa3[k + 1];  \
    res1 OP1 temp3_r * pa3[k + 1];  \
    res1 OP2 temp3_i * pa3[k];  \
    \
    y[0] = res0;  \
    y[1] = res1;
|
||||
|
||||
/*
 * Accumulate the contribution of two columns of A (pa0, pa1, at float
 * offset k) into the accumulators y0r/y0i and y1r/y1i.
 *
 * LD_SP4 and PCKEVOD_W2_SP are defined in macros_msa.h; presumably they
 * load four consecutive vectors and de-interleave (re, im) pairs -- confirm.
 * Updates are grouped per accumulator with the per-accumulator term order
 * unchanged.
 */
#define CGEMV_N_8x2()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    \
    y0r += tp0r * src0r;  \
    y0r += tp1r * src2r;  \
    y0r OP0 tp0i * src0i;  \
    y0r OP0 tp1i * src2i;  \
    \
    y1r += tp0r * src1r;  \
    y1r += tp1r * src3r;  \
    y1r OP0 tp0i * src1i;  \
    y1r OP0 tp1i * src3i;  \
    \
    y0i OP1 tp0r * src0i;  \
    y0i OP1 tp1r * src2i;  \
    y0i OP2 tp0i * src0r;  \
    y0i OP2 tp1i * src2r;  \
    \
    y1i OP1 tp0r * src1i;  \
    y1i OP1 tp1r * src3i;  \
    y1i OP2 tp0i * src1r;  \
    y1i OP2 tp1i * src3r;
|
||||
|
||||
/*
 * Accumulate the contribution of two columns of A (pa0, pa1, at float
 * offset k) into the single accumulator pair y0r / y0i.
 *
 * LD_SP2 and PCKEVOD_W2_SP are defined in macros_msa.h (presumably a
 * two-vector load and an (re, im) de-interleave -- confirm there).
 */
#define CGEMV_N_4x2()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    \
    /* real accumulator */  \
    y0r += tp0r * src0r;  \
    y0r += tp1r * src2r;  \
    y0r OP0 tp0i * src0i;  \
    y0r OP0 tp1i * src2i;  \
    \
    /* imaginary accumulator */  \
    y0i OP1 tp0r * src0i;  \
    y0i OP1 tp1r * src2i;  \
    y0i OP2 tp0i * src0r;  \
    y0i OP2 tp1i * src2r;
|
||||
|
||||
/*
 * Scalar tail: update one complex element of y (y[0] real, y[1] imaginary)
 * with the contribution of two columns (pa0, pa1) scaled by
 * temp0_r/temp0_i and temp1_r/temp1_i.
 */
#define CGEMV_N_1x2()  \
    res0 = y[0];  \
    res1 = y[1];  \
    \
    /* column 0 */  \
    res0 += temp0_r * pa0[k];  \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];  \
    \
    /* column 1 */  \
    res0 += temp1_r * pa1[k];  \
    res0 OP0 temp1_i * pa1[k + 1];  \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];  \
    \
    y[0] = res0;  \
    y[1] = res1;
|
||||
|
||||
/*
 * Scalar kernel for a single column: update one complex element of y
 * (y[0] real, y[1] imaginary) with pa0[k] / pa0[k + 1] scaled by
 * temp_r / temp_i.
 */
#define CGEMV_N_1x1()  \
    res0 = y[0];  \
    res1 = y[1];  \
    \
    res0 += temp_r * pa0[k];  \
    res0 OP0 temp_i * pa0[k + 1];  \
    \
    res1 OP1 temp_r * pa0[k + 1];  \
    res1 OP2 temp_i * pa0[k];  \
    \
    y[0] = res0;  \
    y[1] = res1;
|
||||
|
||||
/*
 * Load four complex entries of a contiguous x, multiply them by alpha
 * (signs per OP3 / OP4) into tp4r / tp4i, then spread each of the four
 * results into its own vector tp0..tp3 (r and i).
 *
 * LD_SP2, PCKEVOD_W2_SP and SPLATI_W4_SP are defined in macros_msa.h;
 * presumably load, (re, im) de-interleave and per-lane broadcast -- confirm.
 */
#define CLOAD_X4_SCALE_VECTOR()  \
    LD_SP2(x, 4, x0, x1);  \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);  \
    \
    /* imaginary part of alpha * x */  \
    tp4i = alphar * x0i;  \
    tp4i OP4 alphai * x0r;  \
    \
    /* real part of alpha * x */  \
    tp4r = alphar * x0r;  \
    tp4r OP3 alphai * x0i;  \
    \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);  \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);
|
||||
|
||||
/*
 * Strided-x variant of the 4-entry x loader: gather four complex entries of
 * x (stride inc_x2 floats) into x0r / x0i, multiply them by alpha (signs per
 * OP3 / OP4) into tp4r / tp4i, then spread each result into its own vector
 * tp0..tp3 (r and i).
 *
 * Lane 0 is seeded with __msa_fill_w instead of inserting into tp0r: tp0r
 * has no value yet the first time this macro runs, so using it as the base
 * vector read an uninitialised variable.  All four lanes are still written
 * explicitly, so the resulting x0r / x0i are the same as intended before.
 *
 * SPLATI_W4_SP is defined in macros_msa.h (presumably a per-lane
 * broadcast -- confirm there).
 */
#define CLOAD_X4_SCALE_GP()  \
    x0r = (v4f32) __msa_fill_w(*((int *) (x + 0 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2)));  \
    x0i = (v4f32) __msa_fill_w(*((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1)));  \
    \
    /* alpha * x, real then imaginary part */  \
    tp4r = alphar * x0r;  \
    tp4r OP3 alphai * x0i;  \
    tp4i = alphar * x0i;  \
    tp4i OP4 alphai * x0r;  \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);  \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
|
||||
|
||||
/*
 * Scale two complex entries of x (stride inc_x2 floats) by alpha using
 * scalar arithmetic (signs per OP3 / OP4), keep the scalar results in
 * temp0_r/temp0_i and temp1_r/temp1_i, and also place each of them in a
 * vector (tp0r/tp0i, tp1r/tp1i) through COPY_FLOAT_TO_VECTOR.
 */
#define CLOAD_X2_SCALE_GP()  \
    /* first entry */  \
    temp0_r = alpha_r * x[0];  \
    temp0_r OP3 alpha_i * x[1];  \
    temp0_i = alpha_r * x[1];  \
    temp0_i OP4 alpha_i * x[0];  \
    \
    /* second entry */  \
    temp1_r = alpha_r * x[inc_x2];  \
    temp1_r OP3 alpha_i * x[inc_x2 + 1];  \
    temp1_i = alpha_r * x[inc_x2 + 1];  \
    temp1_i OP4 alpha_i * x[inc_x2];  \
    \
    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r);  \
    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i);  \
    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r);  \
    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);
|
||||
|
||||
/*
 * Scale a single complex entry of x (x[0] real, x[1] imaginary) by alpha
 * with scalar arithmetic; the result goes to temp_r / temp_i.
 */
#define CLOAD_X1_SCALE_GP()  \
    temp_r = alpha_r * x[0];  \
    temp_r OP3 alpha_i * x[1];  \
    temp_i = alpha_r * x[1];  \
    temp_i OP4 alpha_i * x[0];
|
||||
|
||||
/*
 * Contiguous-y loader for the 8-element kernels: fills y0..y3 from y and
 * derives the accumulators y0r/y0i and y1r/y1i from them.
 * (LD_SP4 / PCKEVOD_W2_SP are defined in macros_msa.h; presumably a
 * four-vector load and an (re, im) de-interleave -- confirm there.)
 */
#define CLOAD_Y8_VECTOR()  \
    LD_SP4(y, 4, y0, y1, y2, y3);  \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);  \
    PCKEVOD_W2_SP(y3, y2, y1r, y1i);
|
||||
|
||||
/*
 * Contiguous-y loader for the 4-element kernels: fills y0, y1 from y and
 * derives the accumulator pair y0r / y0i from them.
 * (LD_SP2 / PCKEVOD_W2_SP are defined in macros_msa.h -- see there.)
 */
#define CLOAD_Y4_VECTOR()  \
    LD_SP2(y, 4, y0, y1);  \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
|
||||
/*
 * Contiguous-y store for the 8-element kernels: recombines y0r/y0i and
 * y1r/y1i into y0..y3 and writes them back to y.
 * (ILVRL_W2_SP / ST_SP4 are defined in macros_msa.h; presumably an
 * interleave and a four-vector store -- confirm there.)
 */
#define CSTORE_Y8_VECTOR()  \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ILVRL_W2_SP(y1i, y1r, y2, y3);  \
    ST_SP4(y0, y1, y2, y3, y, 4);
|
||||
|
||||
/*
 * Contiguous-y store for the 4-element kernels: recombines y0r / y0i into
 * y0, y1 and writes them back to y.
 * (ILVRL_W2_SP / ST_SP2 are defined in macros_msa.h -- see there.)
 */
#define CSTORE_Y4_VECTOR()  \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ST_SP2(y0, y1, y, 4);
|
||||
|
||||
/*
 * Strided-y loader for the 8-element kernels: gathers eight complex
 * elements of y (stride inc_y2 floats) lane by lane into y0r/y0i (elements
 * 0..3) and y1r/y1i (elements 4..7).
 *
 * tp0r is only the starting vector for the first insert of each gather;
 * every lane is overwritten afterwards.
 */
#define CLOAD_Y8_GP()  \
    /* elements 0..3, real parts */  \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));  \
    /* elements 0..3, imaginary parts */  \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));  \
    /* elements 4..7, real parts */  \
    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2)));  \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  1, *((int *)(y + 5 * inc_y2)));  \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  2, *((int *)(y + 6 * inc_y2)));  \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  3, *((int *)(y + 7 * inc_y2)));  \
    /* elements 4..7, imaginary parts */  \
    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  1, *((int *)(y + 5 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  2, *((int *)(y + 6 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  3, *((int *)(y + 7 * inc_y2 + 1)));
|
||||
|
||||
/*
 * Strided-y loader for the 4-element kernels: gathers four complex
 * elements of y (stride inc_y2 floats) lane by lane into y0r / y0i.
 *
 * tp0r is only the starting vector for the first insert of each gather;
 * every lane is overwritten afterwards.
 */
#define CLOAD_Y4_GP()  \
    /* real parts */  \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));  \
    /* imaginary parts */  \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));
|
||||
|
||||
/*
 * Strided-y store for the 8-element kernels: scatters the lanes of y0r/y1r
 * (real parts) and then y0i/y1i (imaginary parts) back to y with a stride
 * of inc_y2 floats.  The store order (all real parts, then all imaginary
 * parts, each in ascending element order) is kept as is.
 */
#define CSTORE_Y8_GP()  \
    /* real parts, elements 0..7 */  \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);  \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);  \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);  \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);  \
    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0);  \
    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1);  \
    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2);  \
    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3);  \
    \
    /* imaginary parts, elements 0..7 */  \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);  \
    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0);  \
    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1);  \
    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2);  \
    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);
|
||||
|
||||
/*
 * Strided-y store for the 4-element kernels: scatters the lanes of y0r
 * (real parts) and then y0i (imaginary parts) back to y with a stride of
 * inc_y2 floats.  The store order is kept as is.
 */
#define CSTORE_Y4_GP()  \
    /* real parts, elements 0..3 */  \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);  \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);  \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);  \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);  \
    \
    /* imaginary parts, elements 0..3 */  \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);
|
||||
|
||||
/*
 * Driver loop shared by all stride configurations.
 *
 * Expects these names to be defined as macros at the point of expansion:
 *   CLOAD_X4_SCALE, CLOAD_X2_SCALE, CLOAD_X1_SCALE,
 *   CLOAD_Y8, CLOAD_Y4, CSTORE_Y8, CSTORE_Y4
 * and these locals to be in scope: m, n, i, j, k, x, y, y_org, pa0..pa3,
 * lda2, inc_x2, inc_y2, tp4r, tp4i, temp0_r..temp3_r, temp0_i..temp3_i
 * (plus everything the helper macros touch).
 *
 * Columns are consumed in blocks of 4, then 2, then 1 (pa0..pa3 and x
 * advance after each block).  For every column block the rows are walked
 * from the top of y (y_org) in steps of 8, then 4, then single elements;
 * k is the float offset into the current columns.
 *
 * Every helper invocation is terminated with ';' (the first CLOAD_Y8()
 * used to lack it), so the driver does not depend on how a helper's
 * expansion happens to end.
 */
#define CGEMV_N_MSA()  \
    for (j = (n >> 2); j--;)  \
    {  \
        CLOAD_X4_SCALE();  \
        \
        k = 0;  \
        y = y_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_Y8();  \
            CGEMV_N_8x4();  \
            CSTORE_Y8();  \
            \
            k += 2 * 8;  \
            y += inc_y2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_Y4();  \
            CGEMV_N_4x4();  \
            CSTORE_Y4();  \
            \
            k += 2 * 4;  \
            y += inc_y2 * 4;  \
        }  \
        \
        if (m & 3)  \
        {  \
            /* scalar copies of the four scaled x entries for the row tail */  \
            temp0_r = tp4r[0];  \
            temp1_r = tp4r[1];  \
            temp2_r = tp4r[2];  \
            temp3_r = tp4r[3];  \
            \
            temp0_i = tp4i[0];  \
            temp1_i = tp4i[1];  \
            temp2_i = tp4i[2];  \
            temp3_i = tp4i[3];  \
            \
            for (i = (m & 3); i--;)  \
            {  \
                CGEMV_N_1x4();  \
                \
                k += 2;  \
                y += inc_y2;  \
            }  \
        }  \
        \
        pa0 += 4 * lda2;  \
        pa1 += 4 * lda2;  \
        pa2 += 4 * lda2;  \
        pa3 += 4 * lda2;  \
        \
        x += 4 * inc_x2;  \
    }  \
    \
    if (n & 2)  \
    {  \
        CLOAD_X2_SCALE();  \
        \
        k = 0;  \
        y = y_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_Y8();  \
            CGEMV_N_8x2();  \
            CSTORE_Y8();  \
            \
            k += 2 * 8;  \
            y += inc_y2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_Y4();  \
            CGEMV_N_4x2();  \
            CSTORE_Y4();  \
            \
            k += 2 * 4;  \
            y += inc_y2 * 4;  \
        }  \
        \
        for (i = (m & 3); i--;)  \
        {  \
            CGEMV_N_1x2();  \
            \
            k += 2;  \
            y += inc_y2;  \
        }  \
        \
        pa0 += 2 * lda2;  \
        pa1 += 2 * lda2;  \
        \
        x += 2 * inc_x2;  \
    }  \
    \
    if (n & 1)  \
    {  \
        CLOAD_X1_SCALE();  \
        \
        k = 0;  \
        y = y_org;  \
        \
        for (i = m; i--;)  \
        {  \
            CGEMV_N_1x1();  \
            \
            k += 2;  \
            y += inc_y2;  \
        }  \
        \
        pa0 += lda2;  \
        x += inc_x2;  \
    }
|
||||
|
||||
/*
 * Single-precision complex GEMV kernel (non-transposed layout) for MSA.
 *
 * For each of the n columns of A, the matching entry of x is multiplied by
 * alpha (alpha_r + i * alpha_i) and the column, scaled by that value, is
 * accumulated into the m complex elements of y.  Conjugation variants are
 * selected at build time through OP0..OP4.
 *
 * lda2, inc_x2 and inc_y2 are doubled on entry, after which they are used
 * as offsets counted in floats.  dummy1 and buffer are not used here.
 * Contiguous x and/or y (doubled stride == 2) take the vector load/store
 * helpers; any other stride takes the element-wise (_GP) helpers.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
          BLASLONG inc_y2, FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT temp_r, temp_i, res0, res1;
    FLOAT temp0_r, temp0_i, temp1_r, temp1_i;
    FLOAT temp2_r, temp2_i, temp3_r, temp3_i;
    v4f32 alphar, alphai;
    v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;
    int x_contig, y_contig;

    /* From here on the strides are counted in floats. */
    lda2 *= 2;
    inc_x2 *= 2;
    inc_y2 *= 2;

    /* Four consecutive columns of A. */
    pa0 = A;
    pa1 = pa0 + lda2;
    pa2 = pa1 + lda2;
    pa3 = pa2 + lda2;

    alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
    alphai = COPY_FLOAT_TO_VECTOR(alpha_i);

    x_contig = (2 == inc_x2);
    y_contig = (2 == inc_y2);

    /* The 2- and 1-column x loaders are the scalar ones in every case. */
    #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
    #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP

    if (x_contig && y_contig)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
        #define CLOAD_Y8        CLOAD_Y8_VECTOR
        #define CLOAD_Y4        CLOAD_Y4_VECTOR
        #define CSTORE_Y8       CSTORE_Y8_VECTOR
        #define CSTORE_Y4       CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (x_contig)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
        #define CLOAD_Y8        CLOAD_Y8_GP
        #define CLOAD_Y4        CLOAD_Y4_GP
        #define CSTORE_Y8       CSTORE_Y8_GP
        #define CSTORE_Y4       CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (y_contig)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_VECTOR
        #define CLOAD_Y4        CLOAD_Y4_VECTOR
        #define CSTORE_Y8       CSTORE_Y8_VECTOR
        #define CSTORE_Y4       CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_GP
        #define CLOAD_Y4        CLOAD_Y4_GP
        #define CSTORE_Y8       CSTORE_Y8_GP
        #define CSTORE_Y4       CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }

    #undef CLOAD_X2_SCALE
    #undef CLOAD_X1_SCALE

    return 0;
}
|
||||
|
||||
/* Drop the sign-operator macros so they do not leak past this kernel. */
#undef OP4
#undef OP3
#undef OP2
#undef OP1
#undef OP0
|
||||
|
|
@ -0,0 +1,583 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Sign operators for the complex multiply-accumulate steps in this file.
 *
 *   OP0       : applied to the (imag * imag) term folded into a real
 *               accumulator.
 *   OP1 / OP2 : applied to the two cross terms folded into an imaginary
 *               accumulator.
 *
 * One set is used when exactly one of CONJ / XCONJ is defined, the other
 * when both or neither are defined.
 */
#undef OP0
#undef OP1
#undef OP2

#if ( defined(CONJ) && !defined(XCONJ) ) || ( !defined(CONJ) && defined(XCONJ) )
    #define OP0  +=
    #define OP1  +=
    #define OP2  -=
#else
    #define OP0  -=
    #define OP1  +=
    #define OP2  +=
#endif
|
||||
|
||||
/*
 * Accumulate, for four columns of A (pa0..pa3, at float offset k), the
 * element-wise complex products of the column data with the x vectors
 * x0r/x0i and x1r/x1i into the per-column accumulators tpNr / tpNi.
 *
 * LD_SP4 and PCKEVOD_W2_SP are defined in macros_msa.h; presumably a
 * four-vector load and an (re, im) de-interleave -- confirm there.
 * Each accumulator receives its terms in the same order as before.
 */
#define CGEMV_T_8x4()  \
    /* column 0 */  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    tp0r += src0r * x0r;  \
    tp0r += src1r * x1r;  \
    tp0r OP0 src0i * x0i;  \
    tp0r OP0 src1i * x1i;  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP1 src1r * x1i;  \
    tp0i OP2 src0i * x0r;  \
    tp0i OP2 src1i * x1r;  \
    \
    /* column 1 */  \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    tp1r += src2r * x0r;  \
    tp1r += src3r * x1r;  \
    tp1r OP0 src2i * x0i;  \
    tp1r OP0 src3i * x1i;  \
    tp1i OP1 src2r * x0i;  \
    tp1i OP1 src3r * x1i;  \
    tp1i OP2 src2i * x0r;  \
    tp1i OP2 src3i * x1r;  \
    \
    /* column 2 */  \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);  \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);  \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);  \
    tp2r += src4r * x0r;  \
    tp2r += src5r * x1r;  \
    tp2r OP0 src4i * x0i;  \
    tp2r OP0 src5i * x1i;  \
    tp2i OP1 src4r * x0i;  \
    tp2i OP1 src5r * x1i;  \
    tp2i OP2 src4i * x0r;  \
    tp2i OP2 src5i * x1r;  \
    \
    /* column 3 */  \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);  \
    tp3r += src6r * x0r;  \
    tp3r += src7r * x1r;  \
    tp3r OP0 src6i * x0i;  \
    tp3r OP0 src7i * x1i;  \
    tp3i OP1 src6r * x0i;  \
    tp3i OP1 src7r * x1i;  \
    tp3i OP2 src6i * x0r;  \
    tp3i OP2 src7i * x1r;
|
||||
|
||||
/*
 * Two-column version of the 8-element step: accumulates the element-wise
 * complex products of columns pa0 / pa1 (at float offset k) with
 * x0r/x0i and x1r/x1i into tp0r/tp0i and tp1r/tp1i.
 *
 * LD_SP4 and PCKEVOD_W2_SP are defined in macros_msa.h (presumably load
 * and (re, im) de-interleave -- confirm there).
 */
#define CGEMV_T_8x2()  \
    /* column 0 */  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    tp0r += src0r * x0r;  \
    tp0r += src1r * x1r;  \
    tp0r OP0 src0i * x0i;  \
    tp0r OP0 src1i * x1i;  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP1 src1r * x1i;  \
    tp0i OP2 src0i * x0r;  \
    tp0i OP2 src1i * x1r;  \
    \
    /* column 1 */  \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    tp1r += src2r * x0r;  \
    tp1r += src3r * x1r;  \
    tp1r OP0 src2i * x0i;  \
    tp1r OP0 src3i * x1i;  \
    tp1i OP1 src2r * x0i;  \
    tp1i OP1 src3r * x1i;  \
    tp1i OP2 src2i * x0r;  \
    tp1i OP2 src3i * x1r;
|
||||
|
||||
/*
 * Single-column version of the 8-element step: accumulates the
 * element-wise complex products of column pa0 (at float offset k) with
 * x0r/x0i and x1r/x1i into tp0r / tp0i.
 *
 * LD_SP4 and PCKEVOD_W2_SP are defined in macros_msa.h (see there).
 */
#define CGEMV_T_8x1()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    \
    /* real accumulator */  \
    tp0r += src0r * x0r;  \
    tp0r += src1r * x1r;  \
    tp0r OP0 src0i * x0i;  \
    tp0r OP0 src1i * x1i;  \
    \
    /* imaginary accumulator */  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP1 src1r * x1i;  \
    tp0i OP2 src0i * x0r;  \
    tp0i OP2 src1i * x1r;
|
||||
|
||||
/*
 * 4-element step for four columns: accumulates the element-wise complex
 * products of columns pa0..pa3 (at float offset k) with x0r / x0i into
 * the per-column accumulators tpNr / tpNi.
 *
 * LD_SP2 and PCKEVOD_W2_SP are defined in macros_msa.h (presumably a
 * two-vector load and an (re, im) de-interleave -- confirm there).
 */
#define CGEMV_T_4x4()  \
    /* column 0 */  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    tp0r += src0r * x0r;  \
    tp0r OP0 src0i * x0i;  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP2 src0i * x0r;  \
    \
    /* column 1 */  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    tp1r += src2r * x0r;  \
    tp1r OP0 src2i * x0i;  \
    tp1i OP1 src2r * x0i;  \
    tp1i OP2 src2i * x0r;  \
    \
    /* column 2 */  \
    LD_SP2(pa2 + k, 4, t8, t9);  \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);  \
    tp2r += src4r * x0r;  \
    tp2r OP0 src4i * x0i;  \
    tp2i OP1 src4r * x0i;  \
    tp2i OP2 src4i * x0r;  \
    \
    /* column 3 */  \
    LD_SP2(pa3 + k, 4, t12, t13);  \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
    tp3r += src6r * x0r;  \
    tp3r OP0 src6i * x0i;  \
    tp3i OP1 src6r * x0i;  \
    tp3i OP2 src6i * x0r;
|
||||
|
||||
/*
 * 4-element step for two columns: accumulates the element-wise complex
 * products of columns pa0 / pa1 (at float offset k) with x0r / x0i into
 * tp0r/tp0i and tp1r/tp1i.
 *
 * LD_SP2 and PCKEVOD_W2_SP are defined in macros_msa.h (see there).
 */
#define CGEMV_T_4x2()  \
    /* column 0 */  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    tp0r += src0r * x0r;  \
    tp0r OP0 src0i * x0i;  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP2 src0i * x0r;  \
    \
    /* column 1 */  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    tp1r += src2r * x0r;  \
    tp1r OP0 src2i * x0i;  \
    tp1i OP1 src2r * x0i;  \
    tp1i OP2 src2i * x0r;
|
||||
|
||||
/*
 * 4-element step for one column: accumulates the element-wise complex
 * products of column pa0 (at float offset k) with x0r / x0i into
 * tp0r / tp0i.
 *
 * LD_SP2 and PCKEVOD_W2_SP are defined in macros_msa.h (see there).
 */
#define CGEMV_T_4x1()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    \
    /* real accumulator */  \
    tp0r += src0r * x0r;  \
    tp0r OP0 src0i * x0i;  \
    \
    /* imaginary accumulator */  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP2 src0i * x0r;
|
||||
|
||||
/*
 * Scalar tail for four columns: multiplies one complex matrix element per
 * column (paN[k] real, paN[k + 1] imaginary) with the current x entry
 * (x[0] real, x[1] imaginary) and accumulates into tempNr / tempNi.
 */
#define CGEMV_T_1x4()  \
    /* column 0 */  \
    temp0r += pa0[k] * x[0];  \
    temp0r OP0 pa0[k + 1] * x[1];  \
    temp0i OP1 pa0[k] * x[1];  \
    temp0i OP2 pa0[k + 1] * x[0];  \
    \
    /* column 1 */  \
    temp1r += pa1[k] * x[0];  \
    temp1r OP0 pa1[k + 1] * x[1];  \
    temp1i OP1 pa1[k] * x[1];  \
    temp1i OP2 pa1[k + 1] * x[0];  \
    \
    /* column 2 */  \
    temp2r += pa2[k] * x[0];  \
    temp2r OP0 pa2[k + 1] * x[1];  \
    temp2i OP1 pa2[k] * x[1];  \
    temp2i OP2 pa2[k + 1] * x[0];  \
    \
    /* column 3 */  \
    temp3r += pa3[k] * x[0];  \
    temp3r OP0 pa3[k + 1] * x[1];  \
    temp3i OP1 pa3[k] * x[1];  \
    temp3i OP2 pa3[k + 1] * x[0];
|
||||
|
||||
/*
 * Scalar tail for two columns: multiplies one complex matrix element per
 * column (paN[k] real, paN[k + 1] imaginary) with the current x entry
 * (x[0] real, x[1] imaginary) and accumulates into temp0r/temp0i and
 * temp1r/temp1i.
 */
#define CGEMV_T_1x2()  \
    /* column 0 */  \
    temp0r += pa0[k] * x[0];  \
    temp0r OP0 pa0[k + 1] * x[1];  \
    temp0i OP1 pa0[k] * x[1];  \
    temp0i OP2 pa0[k + 1] * x[0];  \
    \
    /* column 1 */  \
    temp1r += pa1[k] * x[0];  \
    temp1r OP0 pa1[k + 1] * x[1];  \
    temp1i OP1 pa1[k] * x[1];  \
    temp1i OP2 pa1[k + 1] * x[0];
|
||||
|
||||
/*
 * Scalar tail for one column: multiplies the complex matrix element
 * (pa0[k] real, pa0[k + 1] imaginary) with the current x entry (x[0] real,
 * x[1] imaginary) and accumulates into temp0r / temp0i.
 */
#define CGEMV_T_1x1()  \
    temp0r += pa0[k] * x[0];  \
    temp0r OP0 pa0[k + 1] * x[1];  \
    \
    temp0i OP1 pa0[k] * x[1];  \
    temp0i OP2 pa0[k + 1] * x[0];
|
||||
|
||||
/*
 * Scale the four accumulated results temp0..temp3 (r / i) by alpha and add
 * them to four complex elements of y (stride inc_y2 floats).
 *
 * Runs in three phases that are deliberately kept separate: read all eight
 * floats of y, update the eight scalars, then write them back (all real
 * parts first, then all imaginary parts).
 *
 * NOTE(review): the alpha multiply reuses OP0 / OP1 / OP2, which are keyed
 * on CONJ and XCONJ together -- confirm against the calling routine that
 * this is the intended sign convention for the scaling step.
 */
#define CSCALE_STORE_Y4_GP()  \
    res0r = y[0 * inc_y2];  \
    res0i = y[0 * inc_y2 + 1];  \
    res1r = y[1 * inc_y2];  \
    res1i = y[1 * inc_y2 + 1];  \
    res2r = y[2 * inc_y2];  \
    res2i = y[2 * inc_y2 + 1];  \
    res3r = y[3 * inc_y2];  \
    res3i = y[3 * inc_y2 + 1];  \
    \
    /* element 0 */  \
    res0r += alphar * temp0r;  \
    res0r OP0 alphai * temp0i;  \
    res0i OP1 alphar * temp0i;  \
    res0i OP2 alphai * temp0r;  \
    \
    /* element 1 */  \
    res1r += alphar * temp1r;  \
    res1r OP0 alphai * temp1i;  \
    res1i OP1 alphar * temp1i;  \
    res1i OP2 alphai * temp1r;  \
    \
    /* element 2 */  \
    res2r += alphar * temp2r;  \
    res2r OP0 alphai * temp2i;  \
    res2i OP1 alphar * temp2i;  \
    res2i OP2 alphai * temp2r;  \
    \
    /* element 3 */  \
    res3r += alphar * temp3r;  \
    res3r OP0 alphai * temp3i;  \
    res3i OP1 alphar * temp3i;  \
    res3i OP2 alphai * temp3r;  \
    \
    /* write back: real parts, then imaginary parts */  \
    y[0 * inc_y2] = res0r;  \
    y[1 * inc_y2] = res1r;  \
    y[2 * inc_y2] = res2r;  \
    y[3 * inc_y2] = res3r;  \
    \
    y[0 * inc_y2 + 1] = res0i;  \
    y[1 * inc_y2 + 1] = res1i;  \
    y[2 * inc_y2 + 1] = res2i;  \
    y[3 * inc_y2 + 1] = res3i;
|
||||
|
||||
/*
 * Scale the two accumulated results temp0 / temp1 (r / i) by alpha and add
 * them to two complex elements of y (stride inc_y2 floats).
 *
 * Three separate phases: read y, update the scalars, write back (real
 * parts first, then imaginary parts).
 *
 * NOTE(review): the alpha multiply reuses OP0 / OP1 / OP2 (keyed on CONJ
 * and XCONJ together) -- confirm the intended sign convention with the
 * calling routine.
 */
#define CSCALE_STORE_Y2_GP()  \
    res0r = y[0 * inc_y2];  \
    res0i = y[0 * inc_y2 + 1];  \
    res1r = y[1 * inc_y2];  \
    res1i = y[1 * inc_y2 + 1];  \
    \
    /* element 0 */  \
    res0r += alphar * temp0r;  \
    res0r OP0 alphai * temp0i;  \
    res0i OP1 alphar * temp0i;  \
    res0i OP2 alphai * temp0r;  \
    \
    /* element 1 */  \
    res1r += alphar * temp1r;  \
    res1r OP0 alphai * temp1i;  \
    res1i OP1 alphar * temp1i;  \
    res1i OP2 alphai * temp1r;  \
    \
    /* write back: real parts, then imaginary parts */  \
    y[0 * inc_y2] = res0r;  \
    y[1 * inc_y2] = res1r;  \
    \
    y[0 * inc_y2 + 1] = res0i;  \
    y[1 * inc_y2 + 1] = res1i;
|
||||
|
||||
|
||||
/*
 * Scale the accumulated result temp0r / temp0i by alpha and add it to one
 * complex element of y (y[0] real, y[1] imaginary).
 *
 * NOTE(review): the alpha multiply reuses OP0 / OP1 / OP2 (keyed on CONJ
 * and XCONJ together) -- confirm the intended sign convention with the
 * calling routine.
 */
#define CSCALE_STORE_Y1_GP()  \
    res0r = y[0];  \
    res0i = y[1];  \
    \
    res0r += alphar * temp0r;  \
    res0r OP0 alphai * temp0i;  \
    \
    res0i OP1 alphar * temp0i;  \
    res0i OP2 alphai * temp0r;  \
    \
    y[0] = res0r;  \
    y[1] = res0i;
|
||||
|
||||
/*
 * Contiguous-x loader for the 8-element kernels: fills x0..x3 from x and
 * derives x0r/x0i and x1r/x1i from them.
 * (LD_SP4 / PCKEVOD_W2_SP are defined in macros_msa.h; presumably a
 * four-vector load and an (re, im) de-interleave -- confirm there.)
 */
#define CLOAD_X8_VECTOR()  \
    LD_SP4(x, 4, x0, x1, x2, x3);  \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);  \
    PCKEVOD_W2_SP(x3, x2, x1r, x1i);
|
||||
|
||||
/*
 * Contiguous-x loader for the 4-element kernels: fills x0, x1 from x and
 * derives x0r / x0i from them.
 * (LD_SP2 / PCKEVOD_W2_SP are defined in macros_msa.h -- see there.)
 */
#define CLOAD_X4_VECTOR()  \
    LD_SP2(x, 4, x0, x1);  \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
|
||||
/* Strided-x load for an 8-row step.  Each of the four result vectors is
 * built one 32-bit lane at a time with __msa_insert_w:
 *   x0r <- x[(0..3) * inc_x2]        x0i <- x[(0..3) * inc_x2 + 1]
 *   x1r <- x[(4..7) * inc_x2]        x1i <- x[(4..7) * inc_x2 + 1]
 * The lane-0 insert starts from tp0r only to have some vector to insert
 * into; lanes 1..3 are overwritten by the following three inserts, so no
 * tp0r data survives.  The four builds are independent of each other. */
#define CLOAD_X8_GP()                                                                \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));      \
                                                                                     \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
                                                                                     \
    x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2)));      \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  1, *((int *) (x + 5 * inc_x2)));      \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  2, *((int *) (x + 6 * inc_x2)));      \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  3, *((int *) (x + 7 * inc_x2)));      \
                                                                                     \
    x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1)));  \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  1, *((int *) (x + 5 * inc_x2 + 1)));  \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  2, *((int *) (x + 6 * inc_x2 + 1)));  \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  3, *((int *) (x + 7 * inc_x2 + 1)));
|
||||
|
||||
/* Strided-x load for a 4-row step: x0r receives x[(0..3) * inc_x2] and x0i
 * receives x[(0..3) * inc_x2 + 1], one 32-bit lane per __msa_insert_w.
 * tp0r is only the starting vector for the lane-0 insert; every lane is
 * overwritten before x0r / x0i are complete.  The two builds are
 * independent of each other. */
#define CLOAD_X4_GP()                                                                \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
                                                                                     \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));
|
||||
|
||||
/* Body of the transposed complex GEMV kernel.  It is expanded twice by the
 * enclosing function, once per definition of CLOAD_X8 / CLOAD_X4.
 *
 * Columns are consumed in groups of 4, then 2, then 1 (bits of n).  For
 * each group, rows are consumed 8 at a time, then one step of 4 (m & 4),
 * then singly (m & 3).  Vector partial sums tp* are reduced to the scalar
 * temp* values before the scalar row tail runs, and the group's result is
 * folded into y by the matching CSCALE_STORE_Y*_GP macro.
 *
 * k advances by two per row; x restarts at srcx_org for every column
 * group.  The CGEMV_T_RxC() helpers are defined elsewhere in this file. */
#define CGEMV_T_MSA()                                    \
    j = n >> 2;                                          \
    while (j--)                                          \
    {                                                    \
        tp0r = tp1r = tp2r = tp3r = zero;                \
        tp0i = tp1i = tp2i = tp3i = zero;                \
                                                         \
        k = 0;                                           \
        x = srcx_org;                                    \
                                                         \
        i = m >> 3;                                      \
        while (i--)                                      \
        {                                                \
            CLOAD_X8();                                  \
            CGEMV_T_8x4();                               \
                                                         \
            k += 2 * 8;                                  \
            x += inc_x2 * 8;                             \
        }                                                \
                                                         \
        if (m & 4)                                       \
        {                                                \
            CLOAD_X4();                                  \
                                                         \
            CGEMV_T_4x4();                               \
                                                         \
            k += 2 * 4;                                  \
            x += inc_x2 * 4;                             \
        }                                                \
                                                         \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r,       \
                           tp0r, tp1r, tp2r, tp3r);      \
        TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i,       \
                           tp0i, tp1i, tp2i, tp3i);      \
                                                         \
        tp0r += tp1r;                                    \
        tp0r += tp2r;                                    \
        tp0r += tp3r;                                    \
        tp0i += tp1i;                                    \
        tp0i += tp2i;                                    \
        tp0i += tp3i;                                    \
                                                         \
        temp0r = tp0r[0];                                \
        temp1r = tp0r[1];                                \
        temp2r = tp0r[2];                                \
        temp3r = tp0r[3];                                \
        temp0i = tp0i[0];                                \
        temp1i = tp0i[1];                                \
        temp2i = tp0i[2];                                \
        temp3i = tp0i[3];                                \
                                                         \
        i = m & 3;                                       \
        while (i--)                                      \
        {                                                \
            CGEMV_T_1x4();                               \
                                                         \
            k += 2;                                      \
            x += inc_x2;                                 \
        }                                                \
                                                         \
        CSCALE_STORE_Y4_GP();                            \
                                                         \
        pa0 += 4 * lda2;                                 \
        pa1 += 4 * lda2;                                 \
        pa2 += 4 * lda2;                                 \
        pa3 += 4 * lda2;                                 \
        y += 4 * inc_y2;                                 \
    }                                                    \
                                                         \
    if (n & 2)                                           \
    {                                                    \
        tp0r = tp1r = zero;                              \
        tp0i = tp1i = zero;                              \
                                                         \
        k = 0;                                           \
        x = srcx_org;                                    \
                                                         \
        i = m >> 3;                                      \
        while (i--)                                      \
        {                                                \
            CLOAD_X8();                                  \
                                                         \
            CGEMV_T_8x2();                               \
                                                         \
            k += 2 * 8;                                  \
            x += inc_x2 * 8;                             \
        }                                                \
                                                         \
        if (m & 4)                                       \
        {                                                \
            CLOAD_X4();                                  \
                                                         \
            CGEMV_T_4x2();                               \
                                                         \
            k += 2 * 4;                                  \
            x += inc_x2 * 4;                             \
        }                                                \
                                                         \
        /* one transpose handles real and imaginary   */ \
        /* sums together; lanes 0..3 of the reduced   */ \
        /* tp0r are then read as r0, r1, i0, i1       */ \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i,       \
                           tp0r, tp1r, tp0i, tp1i);      \
                                                         \
        tp0r += tp1r;                                    \
        tp0r += tp0i;                                    \
        tp0r += tp1i;                                    \
                                                         \
        temp0r = tp0r[0];                                \
        temp1r = tp0r[1];                                \
        temp0i = tp0r[2];                                \
        temp1i = tp0r[3];                                \
                                                         \
        i = m & 3;                                       \
        while (i--)                                      \
        {                                                \
            CGEMV_T_1x2();                               \
                                                         \
            k += 2;                                      \
            x += inc_x2;                                 \
        }                                                \
                                                         \
        CSCALE_STORE_Y2_GP();                            \
                                                         \
        pa0 += 2 * lda2;                                 \
        pa1 += 2 * lda2;                                 \
        y += 2 * inc_y2;                                 \
    }                                                    \
                                                         \
    if (n & 1)                                           \
    {                                                    \
        tp0r = zero;                                     \
        tp0i = zero;                                     \
                                                         \
        k = 0;                                           \
        x = srcx_org;                                    \
                                                         \
        i = m >> 3;                                      \
        while (i--)                                      \
        {                                                \
            CLOAD_X8();                                  \
                                                         \
            CGEMV_T_8x1();                               \
                                                         \
            k += 2 * 8;                                  \
            x += inc_x2 * 8;                             \
        }                                                \
                                                         \
        if (m & 4)                                       \
        {                                                \
            CLOAD_X4();                                  \
                                                         \
            CGEMV_T_4x1();                               \
                                                         \
            k += 2 * 4;                                  \
            x += inc_x2 * 4;                             \
        }                                                \
                                                         \
        ILVRL_W2_SP(tp0i, tp0r, t0, t1);                 \
                                                         \
        t0 += t1;                                        \
                                                         \
        temp0r = t0[0] + t0[2];                          \
        temp0i = t0[1] + t0[3];                          \
                                                         \
        i = m & 3;                                       \
        while (i--)                                      \
        {                                                \
            CGEMV_T_1x1();                               \
                                                         \
            k += 2;                                      \
            x += inc_x2;                                 \
        }                                                \
                                                         \
        CSCALE_STORE_Y1_GP();                            \
                                                         \
        pa0 += lda2;                                     \
        y += inc_y2;                                     \
    }
|
||||
|
||||
/* Transposed complex single-precision GEMV entry point (MSA).
 *
 * m, n            row and column counts consumed by CGEMV_T_MSA
 * alphar, alphai  real / imaginary parts of the scale factor applied when
 *                 results are folded into y
 * A, lda          matrix base pointer and leading dimension in complex
 *                 elements (doubled to lda2 because each element is two
 *                 FLOATs)
 * x, inc_x        input vector and its stride in complex elements
 * y, inc_y        output vector and its stride in complex elements
 * dummy1, buffer  not referenced in the visible body
 *
 * The kernel body is expanded twice: with whole-vector loads of x when x
 * is contiguous (inc_x2 == 2) and with per-lane inserts otherwise.
 * Always returns 0.
 *
 * Every local below is part of the implicit interface of the CGEMV_T_* /
 * CLOAD_* / CSCALE_* macros, so none of them may be renamed or removed. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
          FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j, k;
    BLASLONG lda2 = 2 * lda;
    BLASLONG inc_x2 = 2 * inc_x;
    BLASLONG inc_y2 = 2 * inc_y;
    FLOAT *srcx_org = x;
    FLOAT *pa0 = A;
    FLOAT *pa1 = A + lda2;
    FLOAT *pa2 = A + 2 * lda2;
    FLOAT *pa3 = A + 3 * lda2;
    FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
    FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
    v4f32 zero = {0};
    v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;

    if (2 != inc_x2)
    {
        /* strided x: gather the operands lane by lane */
        #define CLOAD_X8  CLOAD_X8_GP
        #define CLOAD_X4  CLOAD_X4_GP

        CGEMV_T_MSA();

        #undef CLOAD_X8
        #undef CLOAD_X4
    }
    else
    {
        /* contiguous x: whole-vector loads */
        #define CLOAD_X8  CLOAD_X8_VECTOR
        #define CLOAD_X4  CLOAD_X4_VECTOR

        CGEMV_T_MSA();

        #undef CLOAD_X8
        #undef CLOAD_X4
    }

    return(0);
}
|
||||
|
||||
#undef OP0
|
||||
#undef OP1
|
||||
#undef OP2
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Strided vector copy: y[i * inc_y] = x[i * inc_x] for i = 0 .. n-1.
 *
 * n             number of elements to copy; nothing is copied when n <= 0
 * x, inc_x      source vector and its element stride
 * y, inc_y      destination vector and its element stride
 *
 * Always returns 0. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i, ix, iy;

    /* The loop condition also covers n <= 0: the body never runs. */
    for (i = 0, ix = 0, iy = 0; i < n; i++, ix += inc_x, iy += inc_y)
    {
        y[iy] = x[ix];
    }

    return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,278 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include "macros_msa.h"
|
||||
|
||||
#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec))
|
||||
|
||||
/* Sum of absolute values: returns |x[0]| + |x[inc_x]| + ... over n elements.
 *
 * n       number of elements; returns 0 when n <= 0
 * x       input vector
 * inc_x   element stride; returns 0 when inc_x <= 0
 *
 * Absolute values are formed by masking off the sign bit (AND_VEC_D with
 * and_vec).  Four vector accumulators are used round-robin.
 *
 * Unit stride: both lanes of every loaded vector are valid data, so both
 * lanes of the reduced accumulator are summed; an odd trailing element is
 * added with fabs().
 *
 * Non-unit stride: each element is still fetched with a full 16-byte
 * LD_DP, i.e. x[k] and x[k + 1], and only lane 0 is kept.  For the last
 * strided element, x[k + 1] lies beyond the end of the vector (which only
 * has to hold 1 + (n - 1) * inc_x elements), so that element is peeled off
 * and added with a scalar fabs() instead of being vector-loaded.  Every
 * other element's second lane is inside the array because inc_x >= 2 on
 * this path. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    FLOAT sumf = 0.0;
    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
    v2f64 zero_v = {0};
    v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};

    if (n <= 0 || inc_x <= 0) return (sumf);

    if (1 == inc_x)
    {
        if (n > 15)
        {
            /* First 16 elements initialise the accumulators directly. */
            n -= 16;

            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_D(src0);
            sum_abs1 = AND_VEC_D(src1);
            sum_abs2 = AND_VEC_D(src2);
            sum_abs3 = AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        for (i = (n >> 4); i--;)
        {
            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }

        if (n & 15)
        {
            /* Remaining whole vectors: (n & 14) / 2 of them. */
            if ((n & 8) && (n & 4) && (n & 2))
            {
                LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
            }
            else if ((n & 8) && (n & 4))
            {
                LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
            }
            else if ((n & 8) && (n & 2))
            {
                LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_DP3_INC(x, 2, src0, src1, src2);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
            }
            else if (n & 8)
            {
                LD_DP4_INC(x, 2, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
            }
            else if (n & 4)
            {
                LD_DP2_INC(x, 2, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
            }
            else if (n & 2)
            {
                src0 = LD_DP(x); x += 2;

                sum_abs0 += AND_VEC_D(src0);
            }

            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

            sumf = sum_abs0[0] + sum_abs0[1];

            if (n & 1)
            {
                sumf += fabs(*x);
            }
        }
        else
        {
            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

            sumf = sum_abs0[0] + sum_abs0[1];
        }
    }
    else
    {
        /* Peel the final element: a 16-byte load there would read past the
         * end of x.  It is added with fabs() after the vector part. */
        n--;

        if (n > 8)
        {
            n -= 8;

            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_D(src0);
            sum_abs1 = AND_VEC_D(src1);
            sum_abs2 = AND_VEC_D(src2);
            sum_abs3 = AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        for (i = (n >> 3); i--;)
        {
            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }

        if (n & 7)
        {
            if ((n & 4) && (n & 2) && (n & 1))
            {
                LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
            }
            else if ((n & 4) && (n & 1))
            {
                LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
            }
            else if ((n & 2) && (n & 1))
            {
                LD_DP3_INC(x, inc_x, src0, src1, src2);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
            }
            else if (n & 4)
            {
                LD_DP4_INC(x, inc_x, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
            }
            else if (n & 2)
            {
                LD_DP2_INC(x, inc_x, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
            }
            else if (n & 1)
            {
                /* advance x so it ends on the peeled final element */
                src0 = LD_DP(x); x += inc_x;

                sum_abs0 += AND_VEC_D(src0);
            }
        }

        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        /* Lane 0 holds the strided elements; lane 1 is the neighbouring
         * data picked up by the wide loads and is discarded.  x now points
         * at the peeled final element. */
        sumf = sum_abs0[0] + fabs(*x);
    }

    return (sumf);
}
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Returns FLOAT (double when DSDOT is defined); x and y are FLOAT vectors. */
|
||||
#if defined(DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
double dot = 0.0;
|
||||
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
|
||||
v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
v2f64 dot0 = {0, 0};
|
||||
|
||||
if (n < 0) return (dot);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
|
||||
LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
dot0 += (vy6 * vx6);
|
||||
dot0 += (vy7 * vx7);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
|
||||
LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
dot0 += (vy6 * vx6);
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
|
||||
LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
}
|
||||
else if ((n & 8) && (n & 2))
|
||||
{
|
||||
LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4);
|
||||
LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP3_INC(x, 2, vx0, vx1, vx2);
|
||||
LD_DP3_INC(y, 2, vy0, vy1, vy2);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
|
||||
LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_DP2_INC(x, 2, vx0, vx1);
|
||||
LD_DP2_INC(y, 2, vy0, vy1);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
vx0 = LD_DP(x); x += 2;
|
||||
vy0 = LD_DP(y); y += 2;
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
dot += (y0 * x0);
|
||||
}
|
||||
}
|
||||
|
||||
dot += dot0[0];
|
||||
dot += dot0[1];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
dot += (y3 * x3);
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(x, inc_x, x0, x1, x2);
|
||||
LD_GP3_INC(y, inc_y, y0, y1, y2);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, x0, x1);
|
||||
LD_GP2_INC(y, inc_y, y0, y1);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
dot += (y0 * x0);
|
||||
}
|
||||
}
|
||||
|
||||
return (dot);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,118 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Packing copy of an m x n block from src (columns lda elements apart)
 * into the contiguous buffer dst.
 *
 * Columns are taken four at a time, then two, then one (bits of n).
 * Within a column group, rows are taken four at a time through vector
 * loads that are interleaved across the columns (ILVRL_D2_DP) before being
 * stored; leftover rows (m & 3) are copied one scalar at a time, emitting
 * one element from each column of the group in turn.
 *
 * m, n      row and column counts
 * src, lda  source block and distance between consecutive columns
 * dst       destination buffer, written sequentially
 *
 * Always returns 0. */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG i, j;
    FLOAT *next_col = src;              /* first column of the next group */
    FLOAT *col0, *col1, *col2, *col3;   /* read cursors, one per column   */
    FLOAT *out = dst;                   /* write cursor                   */
    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
    v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    /* ---- groups of four columns ---- */
    j = n >> 2;
    while (j--)
    {
        col0 = next_col;
        col1 = col0 + lda;
        col2 = col1 + lda;
        col3 = col2 + lda;
        next_col += 4 * lda;

        i = m >> 2;
        while (i--)
        {
            LD_DP2_INC(col0, 2, src0, src1);
            LD_DP2_INC(col1, 2, src2, src3);
            LD_DP2_INC(col2, 2, src4, src5);
            LD_DP2_INC(col3, 2, src6, src7);

            ILVRL_D2_DP(src2, src0, dst0, dst4);
            ILVRL_D2_DP(src6, src4, dst1, dst5);
            ILVRL_D2_DP(src3, src1, dst2, dst6);
            ILVRL_D2_DP(src7, src5, dst3, dst7);

            ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, out, 2);
        }

        i = m & 3;
        while (i--)
        {
            *out++ = *col0++;
            *out++ = *col1++;
            *out++ = *col2++;
            *out++ = *col3++;
        }
    }

    /* ---- one group of two columns ---- */
    if (n & 2)
    {
        col0 = next_col;
        col1 = col0 + lda;
        next_col += 2 * lda;

        i = m >> 2;
        while (i--)
        {
            LD_DP2_INC(col0, 2, src0, src1);
            LD_DP2_INC(col1, 2, src2, src3);

            ILVRL_D2_DP(src2, src0, dst0, dst4);
            ILVRL_D2_DP(src3, src1, dst1, dst5);

            ST_DP4_INC(dst0, dst4, dst1, dst5, out, 2);
        }

        i = m & 3;
        while (i--)
        {
            *out++ = *col0++;
            *out++ = *col1++;
        }
    }

    /* ---- final single column: straight copy ---- */
    if (n & 1)
    {
        col0 = next_col;

        i = m >> 2;
        while (i--)
        {
            LD_DP2(col0, 2, src0, src1);
            col0 += 4;

            ST_DP2(src0, src1, out, 2);
            out += 4;
        }

        i = m & 3;
        while (i--)
        {
            *out++ = *col0++;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Interleaving copy of n lda-strided lines of src, m elements each, into dst.
 *
 * Lines are consumed in groups of 8, then 4, 2 and 1 for the remainder of n.
 * Inside a group, the elements found at the same offset of every line are
 * written next to each other in dst (see the scalar tail loops); dst is
 * filled strictly sequentially.
 *
 * The vector paths rely on LD_DP2_INC / LD_DP / ILVRL_D2_DP / ST_DP8_INC /
 * ST_DP2_INC from macros_msa.h -- presumably post-incrementing paired loads,
 * a doubleword interleave and post-incrementing stores; confirm there.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG cnt, grp;
    FLOAT *base, *out;
    FLOAT *ln1, *ln2, *ln3, *ln4, *ln5, *ln6, *ln7, *ln8;
    v2f64 v0, v1, v2, v3, v4, v5, v6, v7;
    v2f64 v8, v9, v10, v11, v12, v13, v14, v15;
    v2f64 w0, w1, w2, w3, w4, w5, w6, w7;

    base = src;
    out = dst;

    /* Groups of eight lines. */
    grp = n >> 3;
    while (grp--)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        ln5 = ln4 + lda;
        ln6 = ln5 + lda;
        ln7 = ln6 + lda;
        ln8 = ln7 + lda;
        base = ln8 + lda;

        /* Eight offsets per pass, handled as two batches of four. */
        cnt = m >> 3;
        while (cnt--)
        {
            /* First batch: four elements from each of the eight lines. */
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);
            LD_DP2_INC(ln3, 2, v4, v5);
            LD_DP2_INC(ln4, 2, v6, v7);
            LD_DP2_INC(ln5, 2, v8, v9);
            LD_DP2_INC(ln6, 2, v10, v11);
            LD_DP2_INC(ln7, 2, v12, v13);
            LD_DP2_INC(ln8, 2, v14, v15);

            ILVRL_D2_DP(v2, v0, w0, w4);
            ILVRL_D2_DP(v6, v4, w1, w5);
            ILVRL_D2_DP(v10, v8, w2, w6);
            ILVRL_D2_DP(v14, v12, w3, w7);
            ST_DP8_INC(w0, w1, w2, w3, w4, w5, w6, w7, out, 2);

            ILVRL_D2_DP(v3, v1, w0, w4);
            ILVRL_D2_DP(v7, v5, w1, w5);
            ILVRL_D2_DP(v11, v9, w2, w6);
            ILVRL_D2_DP(v15, v13, w3, w7);
            ST_DP8_INC(w0, w1, w2, w3, w4, w5, w6, w7, out, 2);

            /* Second batch: the next four elements from each line. */
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);
            LD_DP2_INC(ln3, 2, v4, v5);
            LD_DP2_INC(ln4, 2, v6, v7);
            LD_DP2_INC(ln5, 2, v8, v9);
            LD_DP2_INC(ln6, 2, v10, v11);
            LD_DP2_INC(ln7, 2, v12, v13);
            LD_DP2_INC(ln8, 2, v14, v15);

            ILVRL_D2_DP(v2, v0, w0, w4);
            ILVRL_D2_DP(v6, v4, w1, w5);
            ILVRL_D2_DP(v10, v8, w2, w6);
            ILVRL_D2_DP(v14, v12, w3, w7);
            ST_DP8_INC(w0, w1, w2, w3, w4, w5, w6, w7, out, 2);

            ILVRL_D2_DP(v3, v1, w0, w4);
            ILVRL_D2_DP(v7, v5, w1, w5);
            ILVRL_D2_DP(v11, v9, w2, w6);
            ILVRL_D2_DP(v15, v13, w3, w7);
            ST_DP8_INC(w0, w1, w2, w3, w4, w5, w6, w7, out, 2);
        }

        /* Scalar tail: one offset (eight outputs) per pass. */
        cnt = m & 7;
        while (cnt--)
        {
            out[0] = *ln1++;
            out[1] = *ln2++;
            out[2] = *ln3++;
            out[3] = *ln4++;
            out[4] = *ln5++;
            out[5] = *ln6++;
            out[6] = *ln7++;
            out[7] = *ln8++;
            out += 8;
        }
    }

    /* Remaining group of four lines. */
    if (n & 4)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        base = ln4 + lda;

        cnt = m >> 2;
        while (cnt--)
        {
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);
            LD_DP2_INC(ln3, 2, v4, v5);
            LD_DP2_INC(ln4, 2, v6, v7);

            ILVRL_D2_DP(v2, v0, w0, w4);
            ILVRL_D2_DP(v6, v4, w1, w5);
            ILVRL_D2_DP(v3, v1, w2, w6);
            ILVRL_D2_DP(v7, v5, w3, w7);

            /* Store order groups the vectors offset by offset. */
            ST_DP8_INC(w0, w1, w4, w5, w2, w3, w6, w7, out, 2);
        }

        cnt = m & 3;
        while (cnt--)
        {
            out[0] = *ln1++;
            out[1] = *ln2++;
            out[2] = *ln3++;
            out[3] = *ln4++;
            out += 4;
        }
    }

    /* Remaining pair of lines. */
    if (n & 2)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        base = ln2 + lda;

        cnt = m >> 1;
        while (cnt--)
        {
            v0 = LD_DP(ln1);
            v1 = LD_DP(ln2);
            ln1 += 2;
            ln2 += 2;

            ILVRL_D2_DP(v1, v0, w0, w1);

            ST_DP2_INC(w0, w1, out, 2);
        }

        if (m & 1)
        {
            out[0] = *ln1;
            out[1] = *ln2;
            out += 2;
        }
    }

    /* Last single line: plain element-by-element copy. */
    if (n & 1)
    {
        ln1 = base;

        cnt = m;
        while (cnt--)
        {
            *out++ = *ln1++;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Tile copy of m lda-strided lines of src, n contiguous elements each, into
 * dst.
 *
 * Lines are consumed in groups of 4, then 2 and 1 for the remainder of m.
 * For every run of 4 contiguous elements, the group's data is written as one
 * tile; successive tiles of the same group are m * 4 elements apart in dst,
 * and successive groups start 16 (resp. 8) elements further into dst.
 * Leftover runs of 2 and 1 contiguous elements are appended to the regions
 * starting at dst + m * (n & ~3) and dst + m * (n & ~1).
 *
 * LD_DP2_INC / LD_DP / ST_DP8 / ST_DP4 / ST_DP2 / ST_DP and the *_INC store
 * variants come from macros_msa.h -- presumably 2-double vector loads/stores
 * with a stride of 2 elements; confirm there.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG cnt, grp;
    FLOAT *base, *ln1, *ln2, *ln3, *ln4;
    FLOAT *tile_base, *tile, *pair_out, *single_out;
    v2f64 v0, v1, v2, v3, v4, v5, v6, v7;

    base = src;
    tile_base = dst;

    pair_out = dst + m * (n & ~3);
    single_out = dst + m * (n & ~1);

    /* Groups of four lines. */
    grp = m >> 2;
    while (grp--)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        base = ln4 + lda;

        tile = tile_base;
        tile_base += 16;

        cnt = n >> 2;
        while (cnt--)
        {
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);
            LD_DP2_INC(ln3, 2, v4, v5);
            LD_DP2_INC(ln4, 2, v6, v7);

            ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2);
            tile += m * 4;
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            v1 = LD_DP(ln2);
            v2 = LD_DP(ln3);
            v3 = LD_DP(ln4);
            ln1 += 2;
            ln2 += 2;
            ln3 += 2;
            ln4 += 2;

            ST_DP4_INC(v0, v1, v2, v3, pair_out, 2);
        }

        if (n & 1)
        {
            single_out[0] = *ln1;
            single_out[1] = *ln2;
            single_out[2] = *ln3;
            single_out[3] = *ln4;
            single_out += 4;
        }
    }

    /* Remaining pair of lines. */
    if (m & 2)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        base = ln2 + lda;

        tile = tile_base;
        tile_base += 8;

        cnt = n >> 2;
        while (cnt--)
        {
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);

            ST_DP4(v0, v1, v2, v3, tile, 2);
            tile += m * 4;
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            v1 = LD_DP(ln2);
            ln1 += 2;
            ln2 += 2;

            ST_DP2_INC(v0, v1, pair_out, 2);
        }

        if (n & 1)
        {
            single_out[0] = *ln1;
            single_out[1] = *ln2;
            single_out += 2;
        }
    }

    /* Last single line. */
    if (m & 1)
    {
        ln1 = base;
        tile = tile_base;

        cnt = n >> 2;
        while (cnt--)
        {
            LD_DP2_INC(ln1, 2, v0, v1);

            ST_DP2(v0, v1, tile, 2);
            tile += 4 * m;
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            ln1 += 2;

            ST_DP(v0, pair_out);
        }

        if (n & 1)
        {
            *single_out = *ln1;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,276 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Tile copy of m lda-strided lines of src, n contiguous elements each, into
 * dst.
 *
 * Lines are consumed in groups of 8, then 4, 2 and 1 for the remainder of m.
 * For every run of 8 contiguous elements, the group's data is written as one
 * tile; successive tiles of the same group are m * 8 elements apart in dst,
 * and successive groups start 64 / 32 / 16 elements further into dst.
 * Leftover runs of 4, 2 and 1 contiguous elements are appended to the
 * regions starting at dst + m * (n & ~7), dst + m * (n & ~3) and
 * dst + m * (n & ~1) respectively.
 *
 * The LD_DP* / ST_DP* helpers come from macros_msa.h -- presumably 2-double
 * vector loads/stores with a stride of 2 elements; confirm there.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG cnt, grp;
    FLOAT *base;
    FLOAT *ln1, *ln2, *ln3, *ln4, *ln5, *ln6, *ln7, *ln8;
    FLOAT *tile_base, *tile, *quad_out, *pair_out, *single_out;
    v2f64 v0, v1, v2, v3, v4, v5, v6, v7;
    v2f64 v8, v9, v10, v11, v12, v13, v14, v15;

    base = src;
    tile_base = dst;

    quad_out = dst + m * (n & ~7);
    pair_out = dst + m * (n & ~3);
    single_out = dst + m * (n & ~1);

    /* Groups of eight lines. */
    grp = m >> 3;
    while (grp--)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        ln5 = ln4 + lda;
        ln6 = ln5 + lda;
        ln7 = ln6 + lda;
        ln8 = ln7 + lda;
        base = ln8 + lda;

        tile = tile_base;
        tile_base += 64;

        cnt = n >> 3;
        while (cnt--)
        {
            /* Lines 1-4 fill the first half of the 64-element tile. */
            LD_DP4_INC(ln1, 2, v0, v1, v2, v3);
            LD_DP4_INC(ln2, 2, v4, v5, v6, v7);
            LD_DP4_INC(ln3, 2, v8, v9, v10, v11);
            LD_DP4_INC(ln4, 2, v12, v13, v14, v15);

            ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2);
            ST_DP8(v8, v9, v10, v11, v12, v13, v14, v15, tile + 16, 2);

            /* Lines 5-8 fill the second half. */
            LD_DP4_INC(ln5, 2, v0, v1, v2, v3);
            LD_DP4_INC(ln6, 2, v4, v5, v6, v7);
            LD_DP4_INC(ln7, 2, v8, v9, v10, v11);
            LD_DP4_INC(ln8, 2, v12, v13, v14, v15);

            ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile + 32, 2);
            ST_DP8(v8, v9, v10, v11, v12, v13, v14, v15, tile + 48, 2);

            tile += m * 8;
        }

        if (n & 4)
        {
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);
            LD_DP2_INC(ln3, 2, v4, v5);
            LD_DP2_INC(ln4, 2, v6, v7);
            LD_DP2_INC(ln5, 2, v8, v9);
            LD_DP2_INC(ln6, 2, v10, v11);
            LD_DP2_INC(ln7, 2, v12, v13);
            LD_DP2_INC(ln8, 2, v14, v15);

            ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, quad_out, 2);
            ST_DP8_INC(v8, v9, v10, v11, v12, v13, v14, v15, quad_out, 2);
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            v1 = LD_DP(ln2);
            v2 = LD_DP(ln3);
            v3 = LD_DP(ln4);
            v4 = LD_DP(ln5);
            v5 = LD_DP(ln6);
            v6 = LD_DP(ln7);
            v7 = LD_DP(ln8);
            ln1 += 2;
            ln2 += 2;
            ln3 += 2;
            ln4 += 2;
            ln5 += 2;
            ln6 += 2;
            ln7 += 2;
            ln8 += 2;

            ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, pair_out, 2);
        }

        if (n & 1)
        {
            single_out[0] = *ln1;
            single_out[1] = *ln2;
            single_out[2] = *ln3;
            single_out[3] = *ln4;
            single_out[4] = *ln5;
            single_out[5] = *ln6;
            single_out[6] = *ln7;
            single_out[7] = *ln8;
            single_out += 8;
        }
    }

    /* Remaining group of four lines. */
    if (m & 4)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        base = ln4 + lda;

        tile = tile_base;
        tile_base += 32;

        cnt = n >> 3;
        while (cnt--)
        {
            LD_DP4_INC(ln1, 2, v0, v1, v2, v3);
            LD_DP4_INC(ln2, 2, v4, v5, v6, v7);
            LD_DP4_INC(ln3, 2, v8, v9, v10, v11);
            LD_DP4_INC(ln4, 2, v12, v13, v14, v15);

            ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2);
            ST_DP8(v8, v9, v10, v11, v12, v13, v14, v15, tile + 16, 2);

            tile += 8 * m;
        }

        if (n & 4)
        {
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);
            LD_DP2_INC(ln3, 2, v4, v5);
            LD_DP2_INC(ln4, 2, v6, v7);

            ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, quad_out, 2);
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            v1 = LD_DP(ln2);
            v2 = LD_DP(ln3);
            v3 = LD_DP(ln4);
            ln1 += 2;
            ln2 += 2;
            ln3 += 2;
            ln4 += 2;

            ST_DP4_INC(v0, v1, v2, v3, pair_out, 2);
        }

        if (n & 1)
        {
            single_out[0] = *ln1;
            single_out[1] = *ln2;
            single_out[2] = *ln3;
            single_out[3] = *ln4;
            single_out += 4;
        }
    }

    /* Remaining pair of lines. */
    if (m & 2)
    {
        ln1 = base;
        ln2 = ln1 + lda;
        base = ln2 + lda;

        tile = tile_base;
        tile_base += 16;

        cnt = n >> 3;
        while (cnt--)
        {
            LD_DP4_INC(ln1, 2, v0, v1, v2, v3);
            LD_DP4_INC(ln2, 2, v4, v5, v6, v7);

            ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2);

            tile += 8 * m;
        }

        if (n & 4)
        {
            LD_DP2_INC(ln1, 2, v0, v1);
            LD_DP2_INC(ln2, 2, v2, v3);

            ST_DP4_INC(v0, v1, v2, v3, quad_out, 2);
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            v1 = LD_DP(ln2);
            ln1 += 2;
            ln2 += 2;

            ST_DP2_INC(v0, v1, pair_out, 2);
        }

        if (n & 1)
        {
            single_out[0] = *ln1;
            single_out[1] = *ln2;
            single_out += 2;
        }
    }

    /* Last single line; nothing follows, so cursors need no further upkeep. */
    if (m & 1)
    {
        ln1 = base;
        tile = tile_base;

        cnt = n >> 3;
        while (cnt--)
        {
            LD_DP4_INC(ln1, 2, v0, v1, v2, v3);

            ST_DP4(v0, v1, v2, v3, tile, 2);

            tile += 8 * m;
        }

        if (n & 4)
        {
            LD_DP2_INC(ln1, 2, v0, v1);

            ST_DP2_INC(v0, v1, quad_out, 2);
        }

        if (n & 2)
        {
            v0 = LD_DP(ln1);
            ln1 += 2;

            ST_DP(v0, pair_out);
        }

        if (n & 1)
        {
            *single_out = *ln1;
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,577 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Update step covering the four accumulators y0..y3 and the eight matrix
 * pointers pa0..pa7.
 *
 * Loads four vectors from each of pa0..pa7 at element offset k (LD_DP4 is
 * defined in macros_msa.h; presumably four consecutive 2-double vectors --
 * confirm there), then adds tpJ * <vector loaded from paJ> into y0..y3.
 * Each accumulator receives its eight products in the order tp0 .. tp7.
 */
#define DGEMV_N_8x8()                          \
{                                              \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);    \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);    \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);    \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);    \
                                               \
    /* accumulator y0 */                       \
    y0 += tp0 * t0;                            \
    y0 += tp1 * t4;                            \
    y0 += tp2 * t8;                            \
    y0 += tp3 * t12;                           \
    y0 += tp4 * t16;                           \
    y0 += tp5 * t20;                           \
    y0 += tp6 * t24;                           \
    y0 += tp7 * t28;                           \
                                               \
    /* accumulator y1 */                       \
    y1 += tp0 * t1;                            \
    y1 += tp1 * t5;                            \
    y1 += tp2 * t9;                            \
    y1 += tp3 * t13;                           \
    y1 += tp4 * t17;                           \
    y1 += tp5 * t21;                           \
    y1 += tp6 * t25;                           \
    y1 += tp7 * t29;                           \
                                               \
    /* accumulator y2 */                       \
    y2 += tp0 * t2;                            \
    y2 += tp1 * t6;                            \
    y2 += tp2 * t10;                           \
    y2 += tp3 * t14;                           \
    y2 += tp4 * t18;                           \
    y2 += tp5 * t22;                           \
    y2 += tp6 * t26;                           \
    y2 += tp7 * t30;                           \
                                               \
    /* accumulator y3 */                       \
    y3 += tp0 * t3;                            \
    y3 += tp1 * t7;                            \
    y3 += tp2 * t11;                           \
    y3 += tp3 * t15;                           \
    y3 += tp4 * t19;                           \
    y3 += tp5 * t23;                           \
    y3 += tp6 * t27;                           \
    y3 += tp7 * t31;                           \
}
|
||||
|
||||
/*
 * Update step covering the two accumulators y0..y1 and the eight matrix
 * pointers pa0..pa7.
 *
 * Loads two vectors from each of pa0..pa7 at element offset k (LD_DP2 is
 * defined in macros_msa.h; presumably two consecutive 2-double vectors --
 * confirm there), then adds tpJ * <vector loaded from paJ> into y0 and y1.
 * Each accumulator receives its eight products in the order tp0 .. tp7.
 */
#define DGEMV_N_4x8()                  \
{                                      \
    LD_DP2(pa0 + k, 2, t0, t1);        \
    LD_DP2(pa1 + k, 2, t4, t5);        \
    LD_DP2(pa2 + k, 2, t8, t9);        \
    LD_DP2(pa3 + k, 2, t12, t13);      \
    LD_DP2(pa4 + k, 2, t16, t17);      \
    LD_DP2(pa5 + k, 2, t20, t21);      \
    LD_DP2(pa6 + k, 2, t24, t25);      \
    LD_DP2(pa7 + k, 2, t28, t29);      \
                                       \
    /* accumulator y0 */               \
    y0 += tp0 * t0;                    \
    y0 += tp1 * t4;                    \
    y0 += tp2 * t8;                    \
    y0 += tp3 * t12;                   \
    y0 += tp4 * t16;                   \
    y0 += tp5 * t20;                   \
    y0 += tp6 * t24;                   \
    y0 += tp7 * t28;                   \
                                       \
    /* accumulator y1 */               \
    y1 += tp0 * t1;                    \
    y1 += tp1 * t5;                    \
    y1 += tp2 * t9;                    \
    y1 += tp3 * t13;                   \
    y1 += tp4 * t17;                   \
    y1 += tp5 * t21;                   \
    y1 += tp6 * t25;                   \
    y1 += tp7 * t29;                   \
}
|
||||
|
||||
/*
 * Update step covering the four accumulators y0..y3 and the four matrix
 * pointers pa0..pa3.
 *
 * Loads four vectors from each of pa0..pa3 at element offset k (LD_DP4 is
 * defined in macros_msa.h -- confirm its exact load pattern there), then
 * adds tpJ * <vector loaded from paJ> into y0..y3. Each accumulator
 * receives its four products in the order tp0 .. tp3.
 */
#define DGEMV_N_8x4()                          \
{                                              \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
                                               \
    /* accumulator y0 */                       \
    y0 += tp0 * t0;                            \
    y0 += tp1 * t4;                            \
    y0 += tp2 * t8;                            \
    y0 += tp3 * t12;                           \
                                               \
    /* accumulator y1 */                       \
    y1 += tp0 * t1;                            \
    y1 += tp1 * t5;                            \
    y1 += tp2 * t9;                            \
    y1 += tp3 * t13;                           \
                                               \
    /* accumulator y2 */                       \
    y2 += tp0 * t2;                            \
    y2 += tp1 * t6;                            \
    y2 += tp2 * t10;                           \
    y2 += tp3 * t14;                           \
                                               \
    /* accumulator y3 */                       \
    y3 += tp0 * t3;                            \
    y3 += tp1 * t7;                            \
    y3 += tp2 * t11;                           \
    y3 += tp3 * t15;                           \
}
|
||||
|
||||
/*
 * Update step covering the two accumulators y0..y1 and the four matrix
 * pointers pa0..pa3.
 *
 * Loads two vectors from each of pa0..pa3 at element offset k (LD_DP2 is
 * defined in macros_msa.h -- confirm its exact load pattern there), then
 * adds tpJ * <vector loaded from paJ> into y0 and y1. Each accumulator
 * receives its four products in the order tp0 .. tp3.
 */
#define DGEMV_N_4x4()                  \
{                                      \
    LD_DP2(pa0 + k, 2, t0, t1);        \
    LD_DP2(pa1 + k, 2, t4, t5);        \
    LD_DP2(pa2 + k, 2, t8, t9);        \
    LD_DP2(pa3 + k, 2, t12, t13);      \
                                       \
    /* accumulator y0 */               \
    y0 += tp0 * t0;                    \
    y0 += tp1 * t4;                    \
    y0 += tp2 * t8;                    \
    y0 += tp3 * t12;                   \
                                       \
    /* accumulator y1 */               \
    y1 += tp0 * t1;                    \
    y1 += tp1 * t5;                    \
    y1 += tp2 * t9;                    \
    y1 += tp3 * t13;                   \
}
|
||||
|
||||
/*
 * Update step covering the four accumulators y0..y3 and the two matrix
 * pointers pa0..pa1.
 *
 * Loads four vectors from each of pa0 and pa1 at element offset k (LD_DP4
 * is defined in macros_msa.h -- confirm its exact load pattern there), then
 * adds tp0 * <pa0 vector> followed by tp1 * <pa1 vector> into y0..y3.
 */
#define DGEMV_N_8x2()                      \
{                                          \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);    \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);    \
                                           \
    /* accumulator y0 */                   \
    y0 += tp0 * t0;                        \
    y0 += tp1 * t4;                        \
                                           \
    /* accumulator y1 */                   \
    y1 += tp0 * t1;                        \
    y1 += tp1 * t5;                        \
                                           \
    /* accumulator y2 */                   \
    y2 += tp0 * t2;                        \
    y2 += tp1 * t6;                        \
                                           \
    /* accumulator y3 */                   \
    y3 += tp0 * t3;                        \
    y3 += tp1 * t7;                        \
}
|
||||
|
||||
/*
 * Update step covering the two accumulators y0..y1 and the two matrix
 * pointers pa0..pa1.
 *
 * Loads two vectors from each of pa0 and pa1 at element offset k (LD_DP2
 * is defined in macros_msa.h -- confirm its exact load pattern there), then
 * adds tp0 * <pa0 vector> followed by tp1 * <pa1 vector> into y0 and y1.
 */
#define DGEMV_N_4x2()              \
{                                  \
    LD_DP2(pa0 + k, 2, t0, t1);    \
    LD_DP2(pa1 + k, 2, t4, t5);    \
                                   \
    /* accumulator y0 */           \
    y0 += tp0 * t0;                \
    y0 += tp1 * t4;                \
                                   \
    /* accumulator y1 */           \
    y1 += tp0 * t1;                \
    y1 += tp1 * t5;                \
}
|
||||
|
||||
/*
 * Strided x: scale eight elements of x (spaced inc_x apart) by alpha into
 * temp0..temp7 and build one vector per scalar in tp0..tp7.
 * COPY_DOUBLE_TO_VECTOR comes from macros_msa.h -- presumably a broadcast
 * of the scalar into every lane; confirm there.
 */
#define DLOAD_X8_SCALE_GP()                \
    temp0 = alpha * x[0];                  \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);    \
    temp1 = alpha * x[inc_x];              \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);    \
    temp2 = alpha * x[2 * inc_x];          \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);    \
    temp3 = alpha * x[3 * inc_x];          \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);    \
    temp4 = alpha * x[4 * inc_x];          \
    tp4 = COPY_DOUBLE_TO_VECTOR(temp4);    \
    temp5 = alpha * x[5 * inc_x];          \
    tp5 = COPY_DOUBLE_TO_VECTOR(temp5);    \
    temp6 = alpha * x[6 * inc_x];          \
    tp6 = COPY_DOUBLE_TO_VECTOR(temp6);    \
    temp7 = alpha * x[7 * inc_x];          \
    tp7 = COPY_DOUBLE_TO_VECTOR(temp7);

/* Strided x: same as above for four elements (temp0..temp3, tp0..tp3). */
#define DLOAD_X4_SCALE_GP()                \
    temp0 = alpha * x[0];                  \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);    \
    temp1 = alpha * x[inc_x];              \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);    \
    temp2 = alpha * x[2 * inc_x];          \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);    \
    temp3 = alpha * x[3 * inc_x];          \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);

/*
 * Contiguous x: load four vectors from x, multiply each by v_alpha, then
 * derive tp0..tp7 from them with SPLATI_D2_DP (macros_msa.h; presumably one
 * output per lane with that lane replicated -- confirm there).
 * Does not touch temp0..temp7.
 */
#define DLOAD_X8_SCALE_VECTOR()      \
    LD_DP4(x, 2, x0, x1, x2, x3);    \
                                     \
    x0 *= v_alpha;                   \
    x1 *= v_alpha;                   \
    x2 *= v_alpha;                   \
    x3 *= v_alpha;                   \
                                     \
    SPLATI_D2_DP(x0, tp0, tp1);      \
    SPLATI_D2_DP(x1, tp2, tp3);      \
    SPLATI_D2_DP(x2, tp4, tp5);      \
    SPLATI_D2_DP(x3, tp6, tp7);

/* Contiguous x: same as above for two vectors of x (tp0..tp3). */
#define DLOAD_X4_SCALE_VECTOR()    \
    LD_DP2(x, 2, x0, x1);          \
                                   \
    x0 *= v_alpha;                 \
    x1 *= v_alpha;                 \
                                   \
    SPLATI_D2_DP(x0, tp0, tp1);    \
    SPLATI_D2_DP(x1, tp2, tp3);
|
||||
|
||||
/*
 * Strided y: gather eight elements of y (spaced inc_y apart) into y0..y3.
 * Each element is read through a long long pointer and inserted into lane
 * 0 or lane 1 with __msa_insert_d; tp0 only serves as the initial vector
 * operand for the lane-0 insert, and lane 1 is overwritten right after.
 */
#define DLOAD_Y8_GP()                                                            \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *) y));             \
    y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *) (y + inc_y)));    \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0,                                  \
                                *((long long *) (y + 2 * inc_y)));               \
    y1 = (v2f64) __msa_insert_d((v2i64) y1, 1,                                   \
                                *((long long *) (y + 3 * inc_y)));               \
    y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0,                                  \
                                *((long long *) (y + 4 * inc_y)));               \
    y2 = (v2f64) __msa_insert_d((v2i64) y2, 1,                                   \
                                *((long long *) (y + 5 * inc_y)));               \
    y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0,                                  \
                                *((long long *) (y + 6 * inc_y)));               \
    y3 = (v2f64) __msa_insert_d((v2i64) y3, 1,                                   \
                                *((long long *) (y + 7 * inc_y)));

/* Strided y: gather four elements of y into y0..y1 (same scheme). */
#define DLOAD_Y4_GP()                                                            \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *) y));             \
    y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *) (y + inc_y)));    \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0,                                  \
                                *((long long *) (y + 2 * inc_y)));               \
    y1 = (v2f64) __msa_insert_d((v2i64) y1, 1,                                   \
                                *((long long *) (y + 3 * inc_y)));

/* Contiguous y: load y0..y3 (resp. y0..y1) straight from y. */
#define DLOAD_Y8_VECTOR()  LD_DP4(y, 2, y0, y1, y2, y3);
#define DLOAD_Y4_VECTOR()  LD_DP2(y, 2, y0, y1);

/*
 * Strided y: scatter the lanes of y0..y3 back to eight elements of y
 * (spaced inc_y apart), copying each lane out with __msa_copy_s_d and
 * writing it through a long long pointer.
 */
#define DSTORE_Y8_GP()                                                   \
    *((long long *) y)               = __msa_copy_s_d((v2i64) y0, 0);    \
    *((long long *) (y + inc_y))     = __msa_copy_s_d((v2i64) y0, 1);    \
    *((long long *) (y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0);    \
    *((long long *) (y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);    \
    *((long long *) (y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0);    \
    *((long long *) (y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1);    \
    *((long long *) (y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0);    \
    *((long long *) (y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1);

/* Strided y: scatter the lanes of y0..y1 back to four elements of y. */
#define DSTORE_Y4_GP()                                                   \
    *((long long *) y)               = __msa_copy_s_d((v2i64) y0, 0);    \
    *((long long *) (y + inc_y))     = __msa_copy_s_d((v2i64) y0, 1);    \
    *((long long *) (y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0);    \
    *((long long *) (y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);

/* Contiguous y: store y0..y3 (resp. y0..y1) straight to y. */
#define DSTORE_Y8_VECTOR()  ST_DP4(y0, y1, y2, y3, y, 2);
#define DSTORE_Y4_VECTOR()  ST_DP2(y0, y1, y, 2);
|
||||
|
||||
/*
 * Driver for y[i * inc_y] += sum_j (alpha * x[j * inc_x]) * paJ[i].
 *
 * Consumes the pa0.. pointers and x in groups of 8, then 4, 2 and 1 (bits
 * of n). For each group y is rewound to y_org and walked in chunks of 8,
 * then 4, then a scalar tail of (m & 3) elements.
 *
 * Expects DLOAD_X8_SCALE, DLOAD_X4_SCALE, DLOAD_Y8, DLOAD_Y4, DSTORE_Y8 and
 * DSTORE_Y4 to be defined at the point of use, and the locals i, j, k, x,
 * y, y_org, pa0..pa7, temp, temp0..temp7, tp0.., alpha, lda, inc_x, inc_y,
 * m and n to be in scope. The scalar tails recompute temp0.. themselves
 * because the vector variants of DLOAD_X*_SCALE do not set them.
 *
 * Expands to several statements: use it only inside a braced block.
 */
#define DGEMV_N_MSA()                        \
    j = n >> 3;                              \
    while (j--)                              \
    {                                        \
        DLOAD_X8_SCALE();                    \
                                             \
        y = y_org;                           \
        k = 0;                               \
                                             \
        i = m >> 3;                          \
        while (i--)                          \
        {                                    \
            DLOAD_Y8();                      \
            DGEMV_N_8x8();                   \
            DSTORE_Y8();                     \
                                             \
            k += 8;                          \
            y += 8 * inc_y;                  \
        }                                    \
                                             \
        if (m & 4)                           \
        {                                    \
            DLOAD_Y4();                      \
            DGEMV_N_4x8();                   \
            DSTORE_Y4();                     \
                                             \
            k += 4;                          \
            y += 4 * inc_y;                  \
        }                                    \
                                             \
        if (m & 3)                           \
        {                                    \
            temp0 = alpha * x[0];            \
            temp1 = alpha * x[inc_x];        \
            temp2 = alpha * x[2 * inc_x];    \
            temp3 = alpha * x[3 * inc_x];    \
            temp4 = alpha * x[4 * inc_x];    \
            temp5 = alpha * x[5 * inc_x];    \
            temp6 = alpha * x[6 * inc_x];    \
            temp7 = alpha * x[7 * inc_x];    \
                                             \
            i = m & 3;                       \
            while (i--)                      \
            {                                \
                temp = *y;                   \
                temp += temp0 * pa0[k];      \
                temp += temp1 * pa1[k];      \
                temp += temp2 * pa2[k];      \
                temp += temp3 * pa3[k];      \
                temp += temp4 * pa4[k];      \
                temp += temp5 * pa5[k];      \
                temp += temp6 * pa6[k];      \
                temp += temp7 * pa7[k];      \
                *y = temp;                   \
                                             \
                k++;                         \
                y += inc_y;                  \
            }                                \
        }                                    \
                                             \
        x += 8 * inc_x;                      \
                                             \
        pa0 += 8 * lda;                      \
        pa1 += 8 * lda;                      \
        pa2 += 8 * lda;                      \
        pa3 += 8 * lda;                      \
        pa4 += 8 * lda;                      \
        pa5 += 8 * lda;                      \
        pa6 += 8 * lda;                      \
        pa7 += 8 * lda;                      \
    }                                        \
                                             \
    if (n & 4)                               \
    {                                        \
        DLOAD_X4_SCALE();                    \
                                             \
        y = y_org;                           \
        k = 0;                               \
                                             \
        i = m >> 3;                          \
        while (i--)                          \
        {                                    \
            DLOAD_Y8();                      \
            DGEMV_N_8x4();                   \
            DSTORE_Y8();                     \
                                             \
            k += 8;                          \
            y += 8 * inc_y;                  \
        }                                    \
                                             \
        if (m & 4)                           \
        {                                    \
            DLOAD_Y4();                      \
            DGEMV_N_4x4();                   \
            DSTORE_Y4();                     \
                                             \
            k += 4;                          \
            y += 4 * inc_y;                  \
        }                                    \
                                             \
        if (m & 3)                           \
        {                                    \
            temp0 = alpha * x[0];            \
            temp1 = alpha * x[inc_x];        \
            temp2 = alpha * x[2 * inc_x];    \
            temp3 = alpha * x[3 * inc_x];    \
                                             \
            i = m & 3;                       \
            while (i--)                      \
            {                                \
                temp = *y;                   \
                temp += temp0 * pa0[k];      \
                temp += temp1 * pa1[k];      \
                temp += temp2 * pa2[k];      \
                temp += temp3 * pa3[k];      \
                *y = temp;                   \
                                             \
                k++;                         \
                y += inc_y;                  \
            }                                \
        }                                    \
                                             \
        x += 4 * inc_x;                      \
                                             \
        pa0 += 4 * lda;                      \
        pa1 += 4 * lda;                      \
        pa2 += 4 * lda;                      \
        pa3 += 4 * lda;                      \
    }                                        \
                                             \
    if (n & 2)                               \
    {                                        \
        temp0 = alpha * x[0];                \
        temp1 = alpha * x[inc_x];            \
                                             \
        tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
        tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
                                             \
        y = y_org;                           \
        k = 0;                               \
                                             \
        i = m >> 3;                          \
        while (i--)                          \
        {                                    \
            DLOAD_Y8();                      \
            DGEMV_N_8x2();                   \
            DSTORE_Y8();                     \
                                             \
            k += 8;                          \
            y += 8 * inc_y;                  \
        }                                    \
                                             \
        if (m & 4)                           \
        {                                    \
            DLOAD_Y4();                      \
            DGEMV_N_4x2();                   \
            DSTORE_Y4();                     \
                                             \
            k += 4;                          \
            y += 4 * inc_y;                  \
        }                                    \
                                             \
        if (m & 3)                           \
        {                                    \
            temp0 = alpha * x[0];            \
            temp1 = alpha * x[inc_x];        \
                                             \
            i = m & 3;                       \
            while (i--)                      \
            {                                \
                temp = *y;                   \
                temp += temp0 * pa0[k];      \
                temp += temp1 * pa1[k];      \
                *y = temp;                   \
                                             \
                k++;                         \
                y += inc_y;                  \
            }                                \
        }                                    \
                                             \
        x += 2 * inc_x;                      \
                                             \
        pa0 += 2 * lda;                      \
        pa1 += 2 * lda;                      \
    }                                        \
                                             \
    if (n & 1)                               \
    {                                        \
        temp = alpha * x[0];                 \
                                             \
        y = y_org;                           \
        k = 0;                               \
                                             \
        i = m;                               \
        while (i--)                          \
        {                                    \
            y[0] += temp * pa0[k];           \
                                             \
            k++;                             \
            y += inc_y;                      \
        }                                    \
    }
|
||||
|
||||
/*
 * Accumulates into y: y[i * inc_y] += alpha * x[j * inc_x] * A[j * lda + i]
 * for i in [0, m) and j in [0, n), using the DGEMV_N_MSA driver macro.
 *
 * The driver is instantiated four times, once per combination of
 * contiguous (increment == 1) or strided access for x and y: contiguous
 * operands use the *_VECTOR load/store helpers, strided ones the *_GP
 * helpers.
 *
 * dummy1 and buffer are not used by this implementation.
 * Always returns 0.
 *
 * Note: the local variable names below are referenced by the helper macros
 * and must not be renamed.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v2f64 v_alpha;
    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;

    v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

    /* Eight cursors into A, each one lda elements after the previous. */
    pa0 = A;
    pa1 = pa0 + lda;
    pa2 = pa1 + lda;
    pa3 = pa2 + lda;
    pa4 = pa3 + lda;
    pa5 = pa4 + lda;
    pa6 = pa5 + lda;
    pa7 = pa6 + lda;

    if (1 == inc_y)
    {
        /* Contiguous y: vector loads and stores. */
#define DLOAD_Y8   DLOAD_Y8_VECTOR
#define DLOAD_Y4   DLOAD_Y4_VECTOR
#define DSTORE_Y8  DSTORE_Y8_VECTOR
#define DSTORE_Y4  DSTORE_Y4_VECTOR

        if (1 == inc_x)
        {
#define DLOAD_X8_SCALE  DLOAD_X8_SCALE_VECTOR
#define DLOAD_X4_SCALE  DLOAD_X4_SCALE_VECTOR

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }
        else
        {
#define DLOAD_X8_SCALE  DLOAD_X8_SCALE_GP
#define DLOAD_X4_SCALE  DLOAD_X4_SCALE_GP

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }

#undef DLOAD_Y8
#undef DLOAD_Y4
#undef DSTORE_Y8
#undef DSTORE_Y4
    }
    else
    {
        /* Strided y: element-wise gather and scatter. */
#define DLOAD_Y8   DLOAD_Y8_GP
#define DLOAD_Y4   DLOAD_Y4_GP
#define DSTORE_Y8  DSTORE_Y8_GP
#define DSTORE_Y4  DSTORE_Y4_GP

        if (1 == inc_x)
        {
#define DLOAD_X8_SCALE  DLOAD_X8_SCALE_VECTOR
#define DLOAD_X4_SCALE  DLOAD_X4_SCALE_VECTOR

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }
        else
        {
#define DLOAD_X8_SCALE  DLOAD_X8_SCALE_GP
#define DLOAD_X4_SCALE  DLOAD_X4_SCALE_GP

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }

#undef DLOAD_Y8
#undef DLOAD_Y4
#undef DSTORE_Y8
#undef DSTORE_Y4
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,589 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* One 8-column x 8-row step of the transposed kernel.
 *
 * For each column pointer pa0..pa7, loads four v2f64 vectors starting at row
 * offset k and accumulates their lane-wise products with x0..x3 into that
 * column's accumulator (tp0..tp7).  LD_DP4 comes from macros_msa.h; the
 * literal 2 is presumably the stride in doubles between the loaded vectors
 * (TODO confirm there).
 *
 * Wrapped in do { } while (0) so that "DGEMV_T_8x8();" is exactly one
 * statement, also under an unbraced if/else.
 */
#define DGEMV_T_8x8()                          \
do                                             \
{                                              \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);    \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);    \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);    \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);    \
                                               \
    tp0 += x0 * t0;                            \
    tp0 += x1 * t1;                            \
    tp0 += x2 * t2;                            \
    tp0 += x3 * t3;                            \
                                               \
    tp1 += x0 * t4;                            \
    tp1 += x1 * t5;                            \
    tp1 += x2 * t6;                            \
    tp1 += x3 * t7;                            \
                                               \
    tp2 += x0 * t8;                            \
    tp2 += x1 * t9;                            \
    tp2 += x2 * t10;                           \
    tp2 += x3 * t11;                           \
                                               \
    tp3 += x0 * t12;                           \
    tp3 += x1 * t13;                           \
    tp3 += x2 * t14;                           \
    tp3 += x3 * t15;                           \
                                               \
    tp4 += x0 * t16;                           \
    tp4 += x1 * t17;                           \
    tp4 += x2 * t18;                           \
    tp4 += x3 * t19;                           \
                                               \
    tp5 += x0 * t20;                           \
    tp5 += x1 * t21;                           \
    tp5 += x2 * t22;                           \
    tp5 += x3 * t23;                           \
                                               \
    tp6 += x0 * t24;                           \
    tp6 += x1 * t25;                           \
    tp6 += x2 * t26;                           \
    tp6 += x3 * t27;                           \
                                               \
    tp7 += x0 * t28;                           \
    tp7 += x1 * t29;                           \
    tp7 += x2 * t30;                           \
    tp7 += x3 * t31;                           \
} while (0)
|
||||
|
||||
/* One 8-column x 4-row step of the transposed kernel.
 *
 * For each column pointer pa0..pa7, loads two v2f64 vectors at row offset k
 * and accumulates their lane-wise products with x0 and x1 into tp0..tp7.
 * LD_DP2 comes from macros_msa.h; the literal 2 is presumably the stride in
 * doubles between the loaded vectors (TODO confirm there).
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_8x4()                \
do                                   \
{                                    \
    LD_DP2(pa0 + k, 2, t0, t1);      \
    LD_DP2(pa1 + k, 2, t4, t5);      \
    LD_DP2(pa2 + k, 2, t8, t9);      \
    LD_DP2(pa3 + k, 2, t12, t13);    \
    LD_DP2(pa4 + k, 2, t16, t17);    \
    LD_DP2(pa5 + k, 2, t20, t21);    \
    LD_DP2(pa6 + k, 2, t24, t25);    \
    LD_DP2(pa7 + k, 2, t28, t29);    \
                                     \
    tp0 += x0 * t0;                  \
    tp0 += x1 * t1;                  \
                                     \
    tp1 += x0 * t4;                  \
    tp1 += x1 * t5;                  \
                                     \
    tp2 += x0 * t8;                  \
    tp2 += x1 * t9;                  \
                                     \
    tp3 += x0 * t12;                 \
    tp3 += x1 * t13;                 \
                                     \
    tp4 += x0 * t16;                 \
    tp4 += x1 * t17;                 \
                                     \
    tp5 += x0 * t20;                 \
    tp5 += x1 * t21;                 \
                                     \
    tp6 += x0 * t24;                 \
    tp6 += x1 * t25;                 \
                                     \
    tp7 += x0 * t28;                 \
    tp7 += x1 * t29;                 \
} while (0)
|
||||
|
||||
/* One 8-column x 2-row step of the transposed kernel.
 *
 * Loads a single v2f64 vector from each column pointer pa0..pa7 at row
 * offset k (LD_DP, from macros_msa.h) and accumulates its lane-wise product
 * with x0 into tp0..tp7.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_8x2()         \
do                            \
{                             \
    t0 = LD_DP(pa0 + k);      \
    t4 = LD_DP(pa1 + k);      \
    t8 = LD_DP(pa2 + k);      \
    t12 = LD_DP(pa3 + k);     \
    t16 = LD_DP(pa4 + k);     \
    t20 = LD_DP(pa5 + k);     \
    t24 = LD_DP(pa6 + k);     \
    t28 = LD_DP(pa7 + k);     \
                              \
    tp0 += x0 * t0;           \
    tp1 += x0 * t4;           \
    tp2 += x0 * t8;           \
    tp3 += x0 * t12;          \
    tp4 += x0 * t16;          \
    tp5 += x0 * t20;          \
    tp6 += x0 * t24;          \
    tp7 += x0 * t28;          \
} while (0)
|
||||
|
||||
/* One 4-column x 8-row step of the transposed kernel.
 *
 * For each column pointer pa0..pa3, loads four v2f64 vectors at row offset k
 * and accumulates their lane-wise products with x0..x3 into tp0..tp3.
 * LD_DP4 comes from macros_msa.h; the literal 2 is presumably the stride in
 * doubles between the loaded vectors (TODO confirm there).
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_4x8()                          \
do                                             \
{                                              \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
                                               \
    tp0 += x0 * t0;                            \
    tp0 += x1 * t1;                            \
    tp0 += x2 * t2;                            \
    tp0 += x3 * t3;                            \
                                               \
    tp1 += x0 * t4;                            \
    tp1 += x1 * t5;                            \
    tp1 += x2 * t6;                            \
    tp1 += x3 * t7;                            \
                                               \
    tp2 += x0 * t8;                            \
    tp2 += x1 * t9;                            \
    tp2 += x2 * t10;                           \
    tp2 += x3 * t11;                           \
                                               \
    tp3 += x0 * t12;                           \
    tp3 += x1 * t13;                           \
    tp3 += x2 * t14;                           \
    tp3 += x3 * t15;                           \
} while (0)
|
||||
|
||||
/* One 4-column x 4-row step of the transposed kernel.
 *
 * For each column pointer pa0..pa3, loads two v2f64 vectors at row offset k
 * (LD_DP2, from macros_msa.h) and accumulates their lane-wise products with
 * x0 and x1 into tp0..tp3.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_4x4()                \
do                                   \
{                                    \
    LD_DP2(pa0 + k, 2, t0, t1);      \
    LD_DP2(pa1 + k, 2, t4, t5);      \
    LD_DP2(pa2 + k, 2, t8, t9);      \
    LD_DP2(pa3 + k, 2, t12, t13);    \
                                     \
    tp0 += x0 * t0;                  \
    tp0 += x1 * t1;                  \
                                     \
    tp1 += x0 * t4;                  \
    tp1 += x1 * t5;                  \
                                     \
    tp2 += x0 * t8;                  \
    tp2 += x1 * t9;                  \
                                     \
    tp3 += x0 * t12;                 \
    tp3 += x1 * t13;                 \
} while (0)
|
||||
|
||||
/* One 4-column x 2-row step of the transposed kernel.
 *
 * Loads a single v2f64 vector from each column pointer pa0..pa3 at row
 * offset k and accumulates its lane-wise product with x0 into tp0..tp3.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_4x2()         \
do                            \
{                             \
    t0 = LD_DP(pa0 + k);      \
    t4 = LD_DP(pa1 + k);      \
    t8 = LD_DP(pa2 + k);      \
    t12 = LD_DP(pa3 + k);     \
                              \
    tp0 += x0 * t0;           \
    tp1 += x0 * t4;           \
    tp2 += x0 * t8;           \
    tp3 += x0 * t12;          \
} while (0)
|
||||
|
||||
/* One 2-column x 8-row step of the transposed kernel.
 *
 * For the column pointers pa0 and pa1, loads four v2f64 vectors at row
 * offset k (LD_DP4, from macros_msa.h) and accumulates their lane-wise
 * products with x0..x3 into tp0 and tp1.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_2x8()                      \
do                                         \
{                                          \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);    \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);    \
                                           \
    tp0 += x0 * t0;                        \
    tp0 += x1 * t1;                        \
    tp0 += x2 * t2;                        \
    tp0 += x3 * t3;                        \
                                           \
    tp1 += x0 * t4;                        \
    tp1 += x1 * t5;                        \
    tp1 += x2 * t6;                        \
    tp1 += x3 * t7;                        \
} while (0)
|
||||
|
||||
/* One 2-column x 4-row step of the transposed kernel.
 *
 * For the column pointers pa0 and pa1, loads two v2f64 vectors at row
 * offset k (LD_DP2, from macros_msa.h) and accumulates their lane-wise
 * products with x0 and x1 into tp0 and tp1.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_2x4()              \
do                                 \
{                                  \
    LD_DP2(pa0 + k, 2, t0, t1);    \
    LD_DP2(pa1 + k, 2, t4, t5);    \
                                   \
    tp0 += x0 * t0;                \
    tp0 += x1 * t1;                \
                                   \
    tp1 += x0 * t4;                \
    tp1 += x1 * t5;                \
} while (0)
|
||||
|
||||
/* One 2-column x 2-row step of the transposed kernel.
 *
 * Loads a single v2f64 vector from pa0 and from pa1 at row offset k and
 * accumulates its lane-wise product with x0 into tp0 and tp1 respectively.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 */
#define DGEMV_T_2x2()        \
do                           \
{                            \
    t0 = LD_DP(pa0 + k);     \
    t4 = LD_DP(pa1 + k);     \
                             \
    tp0 += x0 * t0;          \
    tp1 += x0 * t4;          \
} while (0)
|
||||
|
||||
/* Strided load of eight x elements (x[0 * inc_x] .. x[7 * inc_x]) into the
 * vector pairs x0..x3.
 *
 * Each element is read through a long long lvalue and placed into a 64-bit
 * lane with __msa_insert_d.  tp0 only serves as the starting vector operand
 * of the first insert of each pair; lane 0 and then lane 1 of every
 * destination are overwritten, so its contents do not reach the result.
 *
 * NOTE(review): the (long long *) cast reinterprets the FLOAT storage as a
 * 64-bit integer; this assumes FLOAT is a 64-bit type and that the compiler
 * tolerates the type pun - confirm against the build flags.
 *
 * The body is wrapped in do { } while (0) and no longer ends in a dangling
 * line continuation, so the macro is one statement and cannot absorb the
 * line that follows its definition.
 */
#define DLOAD_X8_GP()                                                              \
do                                                                                 \
{                                                                                  \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x)));   \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
    x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x)));   \
    x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x)));  \
    x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x)));   \
    x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x)));  \
    x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x)));   \
} while (0)
|
||||
|
||||
/* Strided load of four x elements (x[0 * inc_x] .. x[3 * inc_x]) into x0
 * and x1.
 *
 * Each element is read through a long long lvalue and placed into a 64-bit
 * lane with __msa_insert_d; tp0 is only the starting vector operand and both
 * lanes of each destination are overwritten.
 *
 * The body is wrapped in do { } while (0) and no longer ends in a dangling
 * line continuation, so the macro is one statement and cannot absorb the
 * line that follows its definition.
 */
#define DLOAD_X4_GP()                                                              \
do                                                                                 \
{                                                                                  \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x)));   \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
    x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x)));   \
} while (0)
|
||||
|
||||
/* Strided load of two x elements (x[0 * inc_x] and x[1 * inc_x]) into the
 * two lanes of x0.
 *
 * Each element is read through a long long lvalue and placed into a 64-bit
 * lane with __msa_insert_d; tp0 is only the starting vector operand and both
 * lanes of x0 are overwritten.
 *
 * The body is wrapped in do { } while (0) and no longer ends in a dangling
 * line continuation, so the macro is one statement and cannot absorb the
 * line that follows its definition.
 */
#define DLOAD_X2_GP()                                                              \
do                                                                                 \
{                                                                                  \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x)));   \
} while (0)
|
||||
|
||||
/* Unit-stride loads of 8, 4 and 2 consecutive x elements into x0..x3,
 * x0..x1 and x0 respectively, using the LD_DP4 / LD_DP2 / LD_DP helpers
 * from macros_msa.h.
 *
 * The definitions deliberately carry no trailing semicolon: every call site
 * supplies its own, which keeps each expansion a single statement.
 */
#define DLOAD_X8_VECTOR()  LD_DP4(x, 2, x0, x1, x2, x3)
#define DLOAD_X4_VECTOR()  LD_DP2(x, 2, x0, x1)
#define DLOAD_X2_VECTOR()  x0 = LD_DP(x)
|
||||
|
||||
/* Body of the transposed GEMV kernel.
 *
 * For every column c of A (columns are lda elements apart) this forms
 *     dot = sum over r in [0, m) of A[r + c * lda] * x[r * inc_x]
 * and then updates y[c * inc_y] += alpha * dot.
 *
 * Columns are handled in groups of 8 (loop over n >> 3), then 4, 2 and 1
 * according to the low bits of n.  Inside a group the rows are consumed in
 * chunks of 8, 4 and 2 through the DGEMV_T_<cols>x<rows>() macros, and a
 * final odd row is added in scalar code.  The single-column tail is fully
 * scalar.
 *
 * Requirements at the expansion site:
 *   - DLOAD_X8 / DLOAD_X4 / DLOAD_X2 must be defined (they load x0..x3);
 *   - the locals i, j, k, x, y, srcx_org, pa0..pa7, temp0..temp7,
 *     res0..res7, t0.., tp0..tp7 and zero must be in scope.
 * x is rewound to srcx_org for every column group; y and pa0..pa7 are
 * advanced past the columns that were processed.
 *
 * ILVRL_D2_DP and ADD2 come from macros_msa.h; they are used here to turn
 * the two-lane accumulators into per-column sums (presumably interleave
 * followed by a lane-wise add - TODO confirm there).
 *
 * The whole body is wrapped in do { } while (0) so that "DGEMV_T_MSA();"
 * is one statement rather than a loose sequence of a loop and three ifs.
 */
#define DGEMV_T_MSA()                               \
do                                                  \
{                                                   \
    /* ---- groups of eight columns ---- */         \
    j = (n >> 3);                                   \
    while (j--)                                     \
    {                                               \
        tp0 = tp1 = tp2 = tp3 = zero;               \
        tp4 = tp5 = tp6 = tp7 = zero;               \
                                                    \
        k = 0;                                      \
        x = srcx_org;                               \
                                                    \
        i = (m >> 3);                               \
        while (i--)                                 \
        {                                           \
            DLOAD_X8();                             \
            DGEMV_T_8x8();                          \
                                                    \
            x += 8 * inc_x;                         \
            k += 8;                                 \
        }                                           \
                                                    \
        if (m & 4)                                  \
        {                                           \
            DLOAD_X4();                             \
            DGEMV_T_8x4();                          \
                                                    \
            x += 4 * inc_x;                         \
            k += 4;                                 \
        }                                           \
                                                    \
        if (m & 2)                                  \
        {                                           \
            DLOAD_X2();                             \
            DGEMV_T_8x2();                          \
                                                    \
            x += 2 * inc_x;                         \
            k += 2;                                 \
        }                                           \
                                                    \
        /* collapse the vector accumulators */      \
        ILVRL_D2_DP(tp1, tp0, t0, t4);              \
        ILVRL_D2_DP(tp3, tp2, t1, t5);              \
        ILVRL_D2_DP(tp5, tp4, t2, t6);              \
        ILVRL_D2_DP(tp7, tp6, t3, t7);              \
        ADD2(t0, t4, t1, t5, t0, t1);               \
        ADD2(t2, t6, t3, t7, t2, t3);               \
                                                    \
        temp0 = t0[0];                              \
        temp1 = t0[1];                              \
        temp2 = t1[0];                              \
        temp3 = t1[1];                              \
        temp4 = t2[0];                              \
        temp5 = t2[1];                              \
        temp6 = t3[0];                              \
        temp7 = t3[1];                              \
                                                    \
        /* odd trailing row, scalar */              \
        if (m & 1)                                  \
        {                                           \
            temp0 += pa0[k] * x[0];                 \
            temp1 += pa1[k] * x[0];                 \
            temp2 += pa2[k] * x[0];                 \
            temp3 += pa3[k] * x[0];                 \
            temp4 += pa4[k] * x[0];                 \
            temp5 += pa5[k] * x[0];                 \
            temp6 += pa6[k] * x[0];                 \
            temp7 += pa7[k] * x[0];                 \
                                                    \
            x += inc_x;                             \
            k++;                                    \
        }                                           \
                                                    \
        /* all y reads happen before any y write */ \
        res0 = y[0 * inc_y] + alpha * temp0;        \
        res1 = y[1 * inc_y] + alpha * temp1;        \
        res2 = y[2 * inc_y] + alpha * temp2;        \
        res3 = y[3 * inc_y] + alpha * temp3;        \
        res4 = y[4 * inc_y] + alpha * temp4;        \
        res5 = y[5 * inc_y] + alpha * temp5;        \
        res6 = y[6 * inc_y] + alpha * temp6;        \
        res7 = y[7 * inc_y] + alpha * temp7;        \
                                                    \
        y[0 * inc_y] = res0;                        \
        y[1 * inc_y] = res1;                        \
        y[2 * inc_y] = res2;                        \
        y[3 * inc_y] = res3;                        \
        y[4 * inc_y] = res4;                        \
        y[5 * inc_y] = res5;                        \
        y[6 * inc_y] = res6;                        \
        y[7 * inc_y] = res7;                        \
                                                    \
        y += 8 * inc_y;                             \
                                                    \
        pa0 += 8 * lda;                             \
        pa1 += 8 * lda;                             \
        pa2 += 8 * lda;                             \
        pa3 += 8 * lda;                             \
        pa4 += 8 * lda;                             \
        pa5 += 8 * lda;                             \
        pa6 += 8 * lda;                             \
        pa7 += 8 * lda;                             \
    }                                               \
                                                    \
    /* ---- four remaining columns ---- */          \
    if (n & 4)                                      \
    {                                               \
        tp0 = tp1 = tp2 = tp3 = zero;               \
                                                    \
        k = 0;                                      \
        x = srcx_org;                               \
                                                    \
        i = (m >> 3);                               \
        while (i--)                                 \
        {                                           \
            DLOAD_X8();                             \
            DGEMV_T_4x8();                          \
                                                    \
            x += 8 * inc_x;                         \
            k += 8;                                 \
        }                                           \
                                                    \
        if (m & 4)                                  \
        {                                           \
            DLOAD_X4();                             \
            DGEMV_T_4x4();                          \
                                                    \
            x += 4 * inc_x;                         \
            k += 4;                                 \
        }                                           \
                                                    \
        if (m & 2)                                  \
        {                                           \
            DLOAD_X2();                             \
            DGEMV_T_4x2();                          \
                                                    \
            x += 2 * inc_x;                         \
            k += 2;                                 \
        }                                           \
                                                    \
        ILVRL_D2_DP(tp1, tp0, t0, t4);              \
        ILVRL_D2_DP(tp3, tp2, t1, t5);              \
        ADD2(t0, t4, t1, t5, t0, t1);               \
                                                    \
        temp0 = t0[0];                              \
        temp1 = t0[1];                              \
        temp2 = t1[0];                              \
        temp3 = t1[1];                              \
                                                    \
        if (m & 1)                                  \
        {                                           \
            temp0 += pa0[k] * x[0];                 \
            temp1 += pa1[k] * x[0];                 \
            temp2 += pa2[k] * x[0];                 \
            temp3 += pa3[k] * x[0];                 \
                                                    \
            x += inc_x;                             \
            k++;                                    \
        }                                           \
                                                    \
        res0 = y[0 * inc_y] + alpha * temp0;        \
        res1 = y[1 * inc_y] + alpha * temp1;        \
        res2 = y[2 * inc_y] + alpha * temp2;        \
        res3 = y[3 * inc_y] + alpha * temp3;        \
                                                    \
        y[0 * inc_y] = res0;                        \
        y[1 * inc_y] = res1;                        \
        y[2 * inc_y] = res2;                        \
        y[3 * inc_y] = res3;                        \
                                                    \
        y += 4 * inc_y;                             \
                                                    \
        pa0 += 4 * lda;                             \
        pa1 += 4 * lda;                             \
        pa2 += 4 * lda;                             \
        pa3 += 4 * lda;                             \
    }                                               \
                                                    \
    /* ---- two remaining columns ---- */           \
    if (n & 2)                                      \
    {                                               \
        tp0 = tp1 = zero;                           \
                                                    \
        k = 0;                                      \
        x = srcx_org;                               \
                                                    \
        i = (m >> 3);                               \
        while (i--)                                 \
        {                                           \
            DLOAD_X8();                             \
            DGEMV_T_2x8();                          \
                                                    \
            x += 8 * inc_x;                         \
            k += 8;                                 \
        }                                           \
                                                    \
        if (m & 4)                                  \
        {                                           \
            DLOAD_X4();                             \
            DGEMV_T_2x4();                          \
                                                    \
            x += 4 * inc_x;                         \
            k += 4;                                 \
        }                                           \
                                                    \
        if (m & 2)                                  \
        {                                           \
            DLOAD_X2();                             \
            DGEMV_T_2x2();                          \
                                                    \
            x += 2 * inc_x;                         \
            k += 2;                                 \
        }                                           \
                                                    \
        ILVRL_D2_DP(tp1, tp0, t0, t4);              \
                                                    \
        t0 = t0 + t4;                               \
                                                    \
        temp0 = t0[0];                              \
        temp1 = t0[1];                              \
                                                    \
        if (m & 1)                                  \
        {                                           \
            temp0 += pa0[k] * x[0];                 \
            temp1 += pa1[k] * x[0];                 \
            x += inc_x;                             \
            k++;                                    \
        }                                           \
                                                    \
        res0 = y[0 * inc_y] + alpha * temp0;        \
        res1 = y[1 * inc_y] + alpha * temp1;        \
                                                    \
        y[0 * inc_y] = res0;                        \
        y[1 * inc_y] = res1;                        \
                                                    \
        y += 2 * inc_y;                             \
                                                    \
        pa0 += 2 * lda;                             \
        pa1 += 2 * lda;                             \
    }                                               \
                                                    \
    /* ---- last single column, scalar ---- */      \
    if (n & 1)                                      \
    {                                               \
        temp0 = 0.0;                                \
                                                    \
        k = 0;                                      \
        x = srcx_org;                               \
                                                    \
        i = m;                                      \
        while (i--)                                 \
        {                                           \
            temp0 += pa0[k] * x[0];                 \
            x += inc_x;                             \
            k++;                                    \
        }                                           \
                                                    \
        y[0] += alpha * temp0;                      \
        y += inc_y;                                 \
        pa0 += lda;                                 \
    }                                               \
} while (0)
|
||||
|
||||
|
||||
/* Entry point of the MSA double-precision GEMV "T" kernel.
 *
 * Sets up eight column pointers into A that are lda elements apart and a
 * zero vector, then expands DGEMV_T_MSA() with the x-load helpers bound to
 * the strided (_GP) variants when inc_x is not 1 and to the contiguous
 * (_VECTOR) variants when it is.  The expanded body updates
 * y[c * inc_y] += alpha * dot(column c of A, x) for every column c < n.
 *
 * dummy1 and buffer are not referenced.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    /* The kernel macros expanded below pick these locals up by name, so the
     * identifiers must stay exactly as they are. */
    BLASLONG i, j, k;
    FLOAT *srcx_org = x;
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
    v2f64 x0, x1, x2, x3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v2f64 zero = {0};

    /* Eight consecutive columns of A, each lda elements after the previous. */
    pa0 = A;
    pa1 = pa0 + lda;
    pa2 = pa1 + lda;
    pa3 = pa2 + lda;
    pa4 = pa3 + lda;
    pa5 = pa4 + lda;
    pa6 = pa5 + lda;
    pa7 = pa6 + lda;

    if (1 != inc_x)
    {
        /* Non-unit stride: gather x element by element. */
#define DLOAD_X8  DLOAD_X8_GP
#define DLOAD_X4  DLOAD_X4_GP
#define DLOAD_X2  DLOAD_X2_GP

        DGEMV_T_MSA();

#undef DLOAD_X8
#undef DLOAD_X4
#undef DLOAD_X2
    }
    else
    {
        /* Unit stride: load x with plain vector loads. */
#define DLOAD_X8  DLOAD_X8_VECTOR
#define DLOAD_X4  DLOAD_X4_VECTOR
#define DLOAD_X2  DLOAD_X2_VECTOR

        DGEMV_T_MSA();

#undef DLOAD_X8
#undef DLOAD_X4
#undef DLOAD_X2
    }

    return 0;
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue