Merge branch 'develop'

This commit is contained in:
Zhang Xianyi 2016-08-31 23:58:42 -04:00
commit 85636ff1a0
241 changed files with 54786 additions and 3098 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 18)
set(OpenBLAS_PATCH_VERSION 19)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
@ -45,8 +45,8 @@ endif()
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
@ -123,9 +123,9 @@ endforeach ()
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
if (NOT NOFORTRAN AND NOT NO_LAPACK)
include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
if (NOT NO_LAPACKE)
include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
endif ()
endif ()
@ -137,7 +137,7 @@ endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)

View File

@ -150,3 +150,14 @@ In chronological order:
* theoractice <https://github.com/theoractice/>
* [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking
* Paul Mustière <https://github.com/buffer51/>
* [2016-02-04] Fix Android build on ARMV7
* [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
* Shivraj Patil <https://github.com/sva-img/>
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
* Kaustubh Raste <https://github.com/ksraste/>
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA

View File

@ -1,4 +1,22 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.19
1-Sep-2016
common:
* Improved cross compiling.
* Fix the bug on musl libc.
POWER:
* Optimize BLAS on Power8
* Fixed Julia+OpenBLAS bugs on Power8
MIPS:
* Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste)
ARM:
* Improved on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
====================================================================
Version 0.2.18
12-Apr-2016

View File

@ -108,8 +108,6 @@ endif
tests :
ifndef NOFORTRAN
ifndef TARGET
ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@ -119,8 +117,6 @@ ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
endif
endif
endif
libs :
ifeq ($(CORE), UNKOWN)

View File

@ -20,75 +20,75 @@ lib.grd :
$(error OpenBLAS: Please run "make" firstly)
install : lib.grd
@-mkdir -p $(DESTDIR)$(PREFIX)
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@-mkdir -p "$(DESTDIR)$(PREFIX)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif
ifeq ($(OSNAME), CYGWIN_NT)
# Honor DESTDIR and quote the destination, as the WINNT branch does, so staged installs also work on Cygwin.
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@ -96,34 +96,34 @@ endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), Darwin)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
else
#only static
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK!

3
Makefile.mips Normal file
View File

@ -0,0 +1,3 @@
# MIPS (32-bit) architecture makefile fragment.
# Both the BINARY64 branch and the fallback branch are empty here, so this file
# currently contributes no settings of its own.
ifdef BINARY64
else
endif

View File

@ -1,4 +1,26 @@
# CCOMMON_OPT += -DALLOC_SHM
# Derive USE_OPENMP from USE_THREAD:
#   USE_THREAD defined and equal to 0 -> USE_OPENMP = 0
#   USE_THREAD defined, any other value -> USE_OPENMP = 1
#   USE_THREAD not defined              -> USE_OPENMP = 1
# NOTE(review): this assignment is not guarded by the CORE check below, so it
# applies to every core that includes this file — confirm that is intended.
ifdef USE_THREAD
ifeq ($(USE_THREAD), 0)
USE_OPENMP = 0
else
USE_OPENMP = 1
endif
else
USE_OPENMP = 1
endif
# POWER8-specific C (COMMON_OPT) and Fortran (FCOMMON_OPT) flags.
# The two branches differ only in the OpenMP additions: -DUSE_OPENMP -fopenmp.
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
endif
endif
FLAMEPATH = $(HOME)/flame/lib
@ -16,6 +38,16 @@ else
endif
endif
#Either uncomment below line or run make with `USE_MASS=1` to enable support of MASS library
#USE_MASS = 1
# Optional IBM MASS support: only active when USE_MASS is exactly 1.
ifeq ($(USE_MASS), 1)
# Path to MASS libs, change it if the libs are installed at any other location
MASSPATH = /opt/ibm/xlmass/8.1.3/lib
# -DUSE_MASS defines the USE_MASS macro for the compiled sources.
# -mveclibabi=mass is GCC's option for vectorizing math calls through MASS;
# presumably -ftree-vectorize and -funsafe-math-optimizations are added because
# that option requires them — TODO confirm against the GCC PowerPC options docs.
COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
# Link against the MASS libraries located under MASSPATH.
EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
endif
ifdef BINARY64

View File

@ -17,14 +17,26 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
# Pick the MIPS ISA-level compiler flag for the requested TARGET.
# TARGET_FLAGS is appended to the c_check and f_check invocations in the
# config.h rule; it stays unset for any other TARGET.
# P5600 uses the MIPS32 release 5 ISA flag.
ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5
endif
# I6400 and P6600 both use the MIPS64 release 6 ISA flag.
ifeq ($(TARGET), I6400)
TARGET_FLAGS = -mips64r6
endif
ifeq ($(TARGET), P6600)
TARGET_FLAGS = -mips64r6
endif
all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)
config.h : c_check f_check getarch
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC)
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
ifneq ($(ONLY_CBLAS), 1)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
else
#When we only build CBLAS, we set NOFORTRAN=2
echo "NOFORTRAN=2" >> $(TARGET_MAKE)

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.18
VERSION = 0.2.19
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -52,6 +52,7 @@ VERSION = 0.2.18
# USE_THREAD = 0
# If you're going to use this library with OpenMP, please uncomment USE_OPENMP below.
# This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1
# You can define maximum number of threads. Basically it should be
@ -153,10 +154,12 @@ NO_AFFINITY = 1
# Common Optimization Flag;
# The default -O2 is enough.
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2
# gfortran option for LAPACK
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive
# Profiling flags

View File

@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all)
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
@ -462,7 +462,7 @@ endif
endif
endif
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
NO_BINARY_MODE = 1
endif
@ -502,13 +502,16 @@ endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
CCOMMON_OPT += -mabi=64
else
CCOMMON_OPT += -mabi=n32
endif
BINARY_DEFINED = 1
else ifeq ($(ARCH), $(filter $(ARCH),mips))
CCOMMON_OPT += -mabi=32
BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
@ -521,6 +524,21 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif
ifeq ($(CORE), I6400)
CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
endif
ifeq ($(CORE), P6600)
CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
endif
ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
@ -589,12 +607,14 @@ ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
FCOMMON_OPT += -mabi=64
else
FCOMMON_OPT += -mabi=n32
endif
else ifeq ($(ARCH), $(filter $(ARCH),mips))
FCOMMON_OPT += -mabi=32
endif
else
ifdef BINARY64
@ -677,21 +697,7 @@ FCOMMON_OPT += -i8
endif
endif
endif
ifneq ($(ARCH), mips64)
ifndef BINARY64
FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
else
ifdef BINARY64
FCOMMON_OPT += -mabi=64
else
FCOMMON_OPT += -mabi=n32
endif
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
@ -707,7 +713,7 @@ endif
endif
endif
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
FCOMMON_OPT += -n32
else
@ -737,7 +743,7 @@ endif
ifeq ($(C_COMPILER), OPEN64)
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
CCOMMON_OPT += -n32
else
@ -1126,6 +1132,8 @@ export HAVE_VFP
export HAVE_VFPV3
export HAVE_VFPV4
export HAVE_NEON
export HAVE_MSA
export MSA_FLAGS
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE

View File

@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6
make DEBUG=1
### Compile with MASS Support on Power CPU (Optional dependency)
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
The library can be installed as follows:
* On Ubuntu:
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.3
* On RHEL/CentOS:
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.3
After installing the MASS library, compile OpenBLAS with USE_MASS=1.
Example:
Compiling on Power8 with MASS support -
make USE_MASS=1 TARGET=POWER8
### Install to the directory (optional)
Example:
@ -82,6 +111,7 @@ Please read GotoBLAS_01Readme.txt
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages
Link with libopenblas.a or -lopenblas for shared library.

View File

@ -53,26 +53,31 @@ PPC440
PPC440FP2
CELL
3.MIPS64 CPU:
3.MIPS CPU:
P5600
4.MIPS64 CPU:
SICORTEX
LOONGSON3A
LOONGSON3B
I6400
P6600
4.IA64 CPU:
5.IA64 CPU:
ITANIUM2
5.SPARC CPU:
6.SPARC CPU:
SPARC
SPARCV7
6.ARM CPU:
7.ARM CPU:
CORTEXA15
CORTEXA9
ARMV7
ARMV6
ARMV5
7.ARM 64-bit CPU:
8.ARM 64-bit CPU:
ARMV8
CORTEXA57

View File

@ -1,4 +1,4 @@
version: 0.2.18.{build}
version: 0.2.19.{build}
#environment:

View File

@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling
smallscaling \
isamax.goto idamax.goto icamax.goto izamax.goto \
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
@ -261,7 +265,9 @@ endif
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \
scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \
strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@ -393,6 +399,9 @@ scholesky.mkl : scholesky.$(SUFFIX)
scholesky.veclib : scholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
scholesky.essl : scholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dcholesky ###################################################
dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME)
@ -410,6 +419,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX)
dcholesky.veclib : dcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dcholesky.essl : dcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ccholesky ###################################################
ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME)
@ -427,6 +439,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX)
ccholesky.veclib : ccholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ccholesky.essl : ccholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zcholesky ###################################################
@ -445,6 +460,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX)
zcholesky.veclib : zcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zcholesky.essl : zcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sgemm ####################################################
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -683,6 +701,9 @@ strsm.mkl : strsm.$(SUFFIX)
strsm.veclib : strsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
strsm.essl : strsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dtrsm ####################################################
dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -699,6 +720,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX)
dtrsm.veclib : dtrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtrsm.essl : dtrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ctrsm ####################################################
ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME)
@ -716,6 +740,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX)
ctrsm.veclib : ctrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctrsm.essl : ctrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ztrsm ####################################################
ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME)
@ -733,6 +760,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX)
ztrsm.veclib : ztrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztrsm.essl : ztrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ssyrk ####################################################
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -1911,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
zgemm3m.veclib : zgemm3m.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## ISAMAX ##############################################
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
isamax.atlas : isamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## IDAMAX ##############################################
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
idamax.atlas : idamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## ICAMAX ##############################################
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
icamax.atlas : icamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## IZAMAX ##############################################
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
izamax.atlas : izamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## SNRM2 ##############################################
snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
snrm2.atlas : snrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## DNRM2 ##############################################
dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dnrm2.atlas : dnrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## SCNRM2 ##############################################
scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
scnrm2.atlas : scnrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## DZNRM2 ##############################################
dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dznrm2.atlas : dznrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
###################################################################################################
slinpack.$(SUFFIX) : linpack.c
@ -2217,11 +2304,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
isamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
icamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
izamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
snrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dnrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
scnrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
dznrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
include $(TOPDIR)/Makefile.tail

View File

@ -183,9 +183,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
#endif
}

View File

@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -221,7 +221,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}
@ -258,7 +258,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}

190
benchmark/iamax.c Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
/*
 * Select the BLAS entry point benchmarked by this file.
 * COMPLEX and DOUBLE are supplied by the build (-D/-U flags) and pick one of
 * the four i?amax variants.
 */
#undef IAMAX

#if defined(COMPLEX) && defined(DOUBLE)
#define IAMAX BLASFUNC(izamax)
#elif defined(COMPLEX)
#define IAMAX BLASFUNC(icamax)
#elif defined(DOUBLE)
#define IAMAX BLASFUNC(idamax)
#else
#define IAMAX BLASFUNC(isamax)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for Windows builds, implemented on top
 * of GetSystemTimeAsFileTime().
 *
 * tv - receives the current time split into seconds and microseconds,
 *      rebased to the Unix epoch via DELTA_EPOCH_IN_MICROSECS; nothing is
 *      written when it is NULL.
 * tz - accepted for signature compatibility only; it is never read.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres;

  (void)tz;  /* intentionally unused */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Combine the two 32-bit halves of the FILETIME into one 64-bit count. */
      tmpres   = ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres  |= ft.dwLowDateTime;

      tmpres /= 10;                        /* convert into microseconds */
      tmpres -= DELTA_EPOCH_IN_MICROSECS;  /* converting file time to unix epoch */

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate memory from a System V shared-memory segment requested with
 * SHM_HUGETLB, as a drop-in for malloc() in this benchmark.
 *
 * size - number of bytes wanted; the segment length passed to shmget() is
 *        (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * Returns the address the segment was attached at.  On any failure a message
 * is printed to stdout and the process exits with status 1.  The segment is
 * marked for removal (IPC_RMID) immediately after it has been attached.
 *
 * Note: the enclosing #if currently ends in "&& 0", so this code is compiled
 * out.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  int   segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT | 0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  /* shmat() reports failure as (void *)-1 */
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the IAMAX routine selected at compile time.
 *
 * Command line:  [from [to [step]]]
 *   from - first vector length (default 1)
 *   to   - last vector length (default 200; raised to `from` if smaller)
 *   step - length increment (default 1; values below 1 are clamped to 1)
 *
 * Environment:
 *   OPENBLAS_LOOPS - calls timed per length (default 1; values below 1 are
 *                    clamped to 1)
 *   OPENBLAS_INCX  - stride passed to IAMAX (default 1)
 *
 * For every length the vector is refilled with values in [-0.5, 0.5], IAMAX
 * is timed with gettimeofday(), and the average seconds per call is written
 * to stderr.  Returns 0; exits with status 1 if the buffer cannot be
 * allocated.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  blasint m, i;
  blasint inc_x = 1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to   = 200;
  int step = 1;

  struct timeval start, stop;
  double time1, timeg;

  argc--; argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++; }
  if (argc > 0) { to   = MAX(atol(*argv), from); argc--; argv++; }
  if (argc > 0) { step = atol(*argv); argc--; argv++; }

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))  inc_x = atoi(p);

  /* A non-positive step would keep `m <= to` true forever. */
  if (step < 1) step = 1;

  /* A non-positive loop count would time nothing and then divide by zero. */
  if (loops < 1) loops = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);

  /* One buffer sized for the largest length; reused for every iteration. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Time\n");

  for (m = from; m <= to; m += step)
    {
      timeg = 0;

      fprintf(stderr, " %6d : ", (int)m);

      for (l = 0; l < loops; l++)
        {
          /* Fresh input for every timed call. */
          for (i = 0; i < m * COMPSIZE * abs(inc_x); i++){
            x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
          }

          gettimeofday( &start, (struct timezone *)0);

          IAMAX (&m, x, &inc_x);

          gettimeofday( &stop, (struct timezone *)0);

          time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

          timeg += time1;
        }

      timeg /= loops;

      fprintf(stderr, " %10.6f secs\n", timeg);
    }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

190
benchmark/nrm2.c Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
/*
 * Select the BLAS entry point benchmarked by this file.
 * COMPLEX and DOUBLE are supplied by the build (-D/-U flags) and pick one of
 * the four ?nrm2 variants.
 */
#undef NRM2

#if defined(COMPLEX) && defined(DOUBLE)
#define NRM2 BLASFUNC(dznrm2)
#elif defined(COMPLEX)
#define NRM2 BLASFUNC(scnrm2)
#elif defined(DOUBLE)
#define NRM2 BLASFUNC(dnrm2)
#else
#define NRM2 BLASFUNC(snrm2)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for Windows builds, implemented on top
 * of GetSystemTimeAsFileTime().
 *
 * tv - receives the current time split into seconds and microseconds,
 *      rebased to the Unix epoch via DELTA_EPOCH_IN_MICROSECS; nothing is
 *      written when it is NULL.
 * tz - accepted for signature compatibility only; it is never read.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres;

  (void)tz;  /* intentionally unused */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Combine the two 32-bit halves of the FILETIME into one 64-bit count. */
      tmpres   = ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres  |= ft.dwLowDateTime;

      tmpres /= 10;                        /* convert into microseconds */
      tmpres -= DELTA_EPOCH_IN_MICROSECS;  /* converting file time to unix epoch */

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate memory from a System V shared-memory segment requested with
 * SHM_HUGETLB, as a drop-in for malloc() in this benchmark.
 *
 * size - number of bytes wanted; the segment length passed to shmget() is
 *        (size + HUGE_PAGESIZE) masked with ~(HUGE_PAGESIZE - 1).
 *
 * Returns the address the segment was attached at.  On any failure a message
 * is printed to stdout and the process exits with status 1.  The segment is
 * marked for removal (IPC_RMID) immediately after it has been attached.
 *
 * Note: the enclosing #if currently ends in "&& 0", so this code is compiled
 * out.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  int   segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT | 0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  /* shmat() reports failure as (void *)-1 */
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the NRM2 routine selected at compile time.
 *
 * Command line:  [from [to [step]]]
 *   from - first vector length (default 1)
 *   to   - last vector length (default 200; raised to `from` if smaller)
 *   step - length increment (default 1; values below 1 are clamped to 1)
 *
 * Environment:
 *   OPENBLAS_LOOPS - calls timed per length (default 1; values below 1 are
 *                    clamped to 1)
 *   OPENBLAS_INCX  - stride passed to NRM2 (default 1)
 *
 * For every length the vector is refilled with values in [-0.5, 0.5], NRM2
 * is timed with gettimeofday(), and the average seconds per call is written
 * to stderr.  Returns 0; exits with status 1 if the buffer cannot be
 * allocated.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  blasint m, i;
  blasint inc_x = 1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to   = 200;
  int step = 1;

  struct timeval start, stop;
  double time1, timeg;

  argc--; argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++; }
  if (argc > 0) { to   = MAX(atol(*argv), from); argc--; argv++; }
  if (argc > 0) { step = atol(*argv); argc--; argv++; }

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))  inc_x = atoi(p);

  /* A non-positive step would keep `m <= to` true forever. */
  if (step < 1) step = 1;

  /* A non-positive loop count would time nothing and then divide by zero. */
  if (loops < 1) loops = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);

  /* One buffer sized for the largest length; reused for every iteration. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Time\n");

  for (m = from; m <= to; m += step)
    {
      timeg = 0;

      fprintf(stderr, " %6d : ", (int)m);

      for (l = 0; l < loops; l++)
        {
          /* Fresh input for every timed call. */
          for (i = 0; i < m * COMPSIZE * abs(inc_x); i++){
            x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
          }

          gettimeofday( &start, (struct timezone *)0);

          NRM2 (&m, x, &inc_x);

          gettimeofday( &stop, (struct timezone *)0);

          time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

          timeg += time1;
        }

      timeg /= loops;

      fprintf(stderr, " %10.6f secs\n", timeg);
    }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -186,8 +186,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -189,9 +189,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg);
#endif
}

View File

@ -5,6 +5,7 @@
#include <time.h>
#include <cblas.h>
#include <omp.h>
#include <pthread.h>
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10

View File

@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);
}

View File

@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

52
c_check
View File

@ -1,5 +1,8 @@
#!/usr/bin/perl
use File::Basename;
use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
@ -8,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@ -26,14 +30,12 @@ if ($?) {
$cross_suffix = "";
if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
if ($1 =~ /(.*-)(.*)/) {
$cross_suffix = $1;
}
} else {
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
$cross_suffix = $1;
}
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /(.*-)(.*)/) {
$cross_suffix .= $1;
}
$compiler = "";
@ -63,7 +65,7 @@ $os = Android if ($data =~ /OS_ANDROID/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
@ -79,7 +81,12 @@ if ($os eq "AIX") {
$defined = 1;
}
if (($architecture eq "mips32") || ($architecture eq "mips64")) {
if ($architecture eq "mips") {
$compiler_name .= " -mabi=32";
$defined = 1;
}
if ($architecture eq "mips64") {
$compiler_name .= " -mabi=n32" if ($binary eq "32");
$compiler_name .= " -mabi=64" if ($binary eq "64");
$defined = 1;
@ -152,10 +159,28 @@ if ($?) {
die 1;
}
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
@ -243,9 +268,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
@ -257,6 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n";
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
if ($os eq "LINUX") {

View File

@ -53,7 +53,7 @@ endif()
add_custom_command(
TARGET ${OpenBLAS_LIBNAME} PRE_LINK
COMMAND perl
ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)

View File

@ -50,20 +50,20 @@ else()
set(TARGET_CONF "config.h")
endif ()
include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
if (NOT NOFORTRAN)
include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
endif ()
# compile getarch
set(GETARCH_SRC
${CMAKE_SOURCE_DIR}/getarch.c
${PROJECT_SOURCE_DIR}/getarch.c
${CPUIDEMO}
)
if (NOT MSVC)
list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S)
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
endif ()
if (MSVC)
@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)

View File

@ -3,7 +3,7 @@
## Description: Ported from OpenBLAS/Makefile.system
##
set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib")
set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib")
# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa
# http://stackoverflow.com/questions/714100/os-detecting-makefile
@ -78,7 +78,7 @@ else ()
set(ONLY_CBLAS 0)
endif ()
include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (NOT DEFINED NUM_THREADS)
set(NUM_THREADS ${NUM_CORES})
@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy")
set(OBJCONV "${CROSS_SUFFIX}objconv")
# OS dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/os.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/os.cmake")
# Architecture dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
# C Compiler dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
endif ()
if (BINARY64)
@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX)
set(SYMBOLSUFFIX "")
endif ()
set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}")
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC")
@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def")
set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp")
set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip")
set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}")
set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}")
set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}")
set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}")
set(LIB_COMPONENTS BLAS)

View File

@ -332,6 +332,13 @@ typedef int blasint;
#endif
#endif
#ifdef POWER8
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/*
#ifdef PILEDRIVER
#ifndef YIELDING
@ -397,6 +404,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_sparc.h"
#endif
#ifdef ARCH_MIPS
#include "common_mips.h"
#endif
#ifdef ARCH_MIPS64
#include "common_mips64.h"
#endif
@ -615,9 +626,14 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
#ifdef USE_OPENMP
#ifndef C_MSVC
int omp_in_parallel(void);
int omp_get_num_procs(void);
#else
__declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif
#else
#ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak));
int omp_get_num_procs(void) __attribute__ ((weak));

109
common_mips.h Normal file
View File

@ -0,0 +1,109 @@
/*****************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#ifndef COMMON_MIPS
#define COMMON_MIPS
#define MB
#define WMB
#define INLINE inline
#define RETURN_BY_COMPLEX
#ifndef ASSEMBLER
/*
 * Lock hook for this architecture.  The body is empty, so a call performs no
 * operation on *address and returns immediately.
 * NOTE(review): with an empty body this provides no mutual exclusion --
 * confirm that is acceptable for threaded builds on this target.
 */
static void INLINE blas_lock(volatile unsigned long *address){
}
/* Marks blas_lock() as provided by this header (presumably so generic code
   does not supply its own -- verify against common.h). */
#define BLAS_LOCK_DEFINED
/*
 * Read hardware register $30 with the `rdhwr` instruction and return the
 * value converted to unsigned int.
 * The asm is volatile and lists "memory" as clobbered, so the compiler will
 * not cache or reorder memory accesses across it.
 * NOTE(review): presumably used as a cycle/time counter -- confirm what $30
 * provides on the target core.
 * NOTE(review): the .set push/.set pop pair changes no assembler option in
 * between -- confirm whether an ISA-level .set was intended here.
 */
static inline unsigned int rpcc(void){
unsigned long ret;
__asm__ __volatile__(".set push \n"
"rdhwr %0, $30 \n"
".set pop" : "=r"(ret) : : "memory");
return ret;
}
/* Marks rpcc() as provided by this header. */
#define RPCC_DEFINED
/*
 * Integer division helper: returns x / y using the plain C division
 * operator.  The caller must pass a non-zero y.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  const blasint quotient = x / y;

  return quotient;
}
#define GET_IMAGE(res)
#define GET_IMAGE_CANCEL
#endif
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/*
 * Entry sequence for assembly kernels: exports REALNAME as a global symbol
 * and places its label.
 * The ARM-only `.arm` directive that used to open this macro has been
 * removed; it is not a MIPS assembler directive.
 */
#define PROLOGUE \
	.global	REALNAME ;\
	.func	REALNAME ;\
REALNAME:

/* No exit sequence or profiling hook is emitted on this architecture. */
#define EPILOGUE

#define PROFCODE

#endif
#define SEEK_ADDRESS
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#define BUFFER_SIZE (16 << 20)
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

View File

@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
static inline unsigned int rpcc(void){
unsigned long ret;
#if defined(LOONGSON3A) || defined(LOONGSON3B)
// unsigned long long tmp;
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
//ret=tmp;
@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
#else
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $30 \n"
".set pop" : "=r"(ret) : : "memory");
#endif
return ret;
}
#define RPCC_DEFINED
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
#define WHEREAMI
static inline int WhereAmI(void){
@ -134,7 +127,6 @@ static inline int WhereAmI(void){
}
#endif
#endif
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;

View File

@ -39,8 +39,13 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#if defined(POWER8)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
#else
#define MB __asm__ __volatile__ ("sync")
#define WMB __asm__ __volatile__ ("sync")
#endif
#define INLINE inline
@ -798,7 +803,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 32 << 20)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

View File

@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
#define CPU_P5600 1
static char *cpuname[] = {
"UNKOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B"
"P5600"
};
int detect(void){
@ -120,7 +116,7 @@ int detect(void){
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_SICORTEX;
return CPU_UNKNOWN;
}
}
//Check model name for Loongson3
@ -149,64 +145,40 @@ char *get_corename(void){
}
void get_architecture(void){
printf("MIPS64");
printf("MIPS");
}
void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
if(detect()==CPU_P5600){
printf("P5600");
}else{
printf("SICORTEX");
printf("UNKNOWN");
}
}
void get_subdirname(void){
printf("mips64");
printf("mips");
}
void get_cpuconfig(void){
if(detect()==CPU_LOONGSON3A) {
printf("#define LOONGSON3A\n");
if(detect()==CPU_P5600){
printf("#define P5600\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{
printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 32\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}else{
printf("#define UNKNOWN\n");
}
}
void get_libname(void){
if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
if(detect()==CPU_P5600) {
printf("p5600\n");
}else{
#ifdef __mips64
printf("mips64\n");
#else
printf("mips32\n");
#endif
printf("mips\n");
}
}

238
cpuid_mips64.c Normal file
View File

@ -0,0 +1,238 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* CPU identifiers returned by detect(); each value is also the index of the
   matching entry in cpuname[] below, so the two lists must stay in step. */
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
#define CPU_I6400 4
#define CPU_P6600 5
/* Core names, indexed by the CPU_* values above (see get_corename()).
   NOTE(review): entry 0 is spelled "UNKOWN"; it is a runtime string, so
   confirm nothing depends on that spelling before correcting it. */
static char *cpuname[] = {
"UNKOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B",
"I6400",
"P6600"
};
#ifdef linux
/*
 * Find the first line of /proc/cpuinfo that starts with `key` and return a
 * pointer to the text that follows its ": " separator.
 *
 * key  - line prefix to look for (e.g. "cpu", "model name")
 * line - caller-supplied buffer the line is read into; the returned pointer
 *        points inside it
 * size - capacity of `line` in bytes
 *
 * Returns NULL when the file cannot be opened, no line matches, or the first
 * matching line has no value after a ':'.
 */
static char *cpuinfo_lookup(const char *key, char *line, int size){

  FILE  *infile;
  char  *sep;
  char  *value  = NULL;
  size_t keylen = strlen(key);

  infile = fopen("/proc/cpuinfo", "r");
  if (infile == NULL) return NULL;

  while (fgets(line, size, infile)){
    if (strncmp(key, line, keylen) != 0) continue;

    sep = strchr(line, ':');

    /* Skip the ':' and the character after it, but never step past the
       end of the string. */
    if (sep != NULL && sep[1] != '\0') value = sep + 2;

    break;
  }

  fclose(infile);

  return value;
}
#endif

/*
 * Identify the CPU from /proc/cpuinfo (only when built with `linux` defined).
 *
 * Lookup order:
 *   1. first line starting with "cpu": "Loongson-3A"/"Loongson-3B" are
 *      returned directly; a bare "Loongson-3" is refined through the
 *      "system type" line; any other value yields CPU_SICORTEX.
 *   2. the "model name" line, checked for "Loongson-3A"/"Loongson-3B".
 *
 * Returns one of the CPU_* identifiers; CPU_UNKNOWN when nothing matched,
 * when /proc/cpuinfo is unreadable, or on non-linux builds.
 *
 * NOTE(review): no path here returns CPU_I6400 or CPU_P6600 -- confirm
 * whether detection for those cores is expected to live elsewhere.
 */
int detect(void){

#ifdef linux
  char buffer[512], *p;

  p = cpuinfo_lookup("cpu", buffer, (int)sizeof(buffer));

  if (p != NULL){
    if (strstr(p, "Loongson-3A")){
      return CPU_LOONGSON3A;
    }else if (strstr(p, "Loongson-3B")){
      return CPU_LOONGSON3B;
    }else if (strstr(p, "Loongson-3")){
      /* Generic Loongson-3: the "system type" line tells 3A apart. */
      p = cpuinfo_lookup("system type", buffer, (int)sizeof(buffer));
      if (p != NULL && strstr(p, "loongson3a"))
        return CPU_LOONGSON3A;
    }else{
      return CPU_SICORTEX;
    }
  }

  /* Check model name for Loongson3 */
  p = cpuinfo_lookup("model name", buffer, (int)sizeof(buffer));

  if (p != NULL){
    if (strstr(p, "Loongson-3A")){
      return CPU_LOONGSON3A;
    }else if (strstr(p, "Loongson-3B")){
      return CPU_LOONGSON3B;
    }
  }
#endif

  return CPU_UNKNOWN;
}
/*
 * Return the name of the detected core: the cpuname[] entry whose index is
 * the identifier reported by detect().
 */
char *get_corename(void){
  const int id = detect();

  return cpuname[id];
}
/* Write the architecture name "MIPS64" to stdout (no trailing newline). */
void get_architecture(void){
  fputs("MIPS64", stdout);
}
/*
 * Write the sub-architecture (core) name to stdout, without a newline.
 * Any core that is not Loongson-3A/3B, I6400 or P6600 -- including
 * CPU_UNKNOWN -- is reported as "SICORTEX".
 *
 * detect() is called once; every call re-reads /proc/cpuinfo.
 */
void get_subarchitecture(void){
  switch (detect()){
  case CPU_LOONGSON3A:
    printf("LOONGSON3A");
    break;
  case CPU_LOONGSON3B:
    printf("LOONGSON3B");
    break;
  case CPU_I6400:
    printf("I6400");
    break;
  case CPU_P6600:
    printf("P6600");
    break;
  default:
    printf("SICORTEX");
    break;
  }
}
/* Write the sub-directory name "mips64" to stdout (no trailing newline). */
void get_subdirname(void){
  fputs("mips64", stdout);
}
/* Write the configuration of the detected core to stdout as a sequence of
 * "#define NAME VALUE" lines: the core name followed by L1/L2 cache sizes
 * and line sizes, DTB entry count and size, and L2 associativity.
 *
 * detect() is called once and its result reused; every call re-reads
 * /proc/cpuinfo.  Any value without its own case (including CPU_SICORTEX
 * and CPU_UNKNOWN) gets the SICORTEX configuration.
 *
 * NOTE(review): L2_SIZE 512488 is not a power of two and may be a
 * transposition of 524288 (512 KiB); the emitted text is left unchanged
 * here - confirm against the other places that define L2_SIZE before
 * altering it. */
void get_cpuconfig(void){
  switch (detect()){
  case CPU_LOONGSON3A:
    printf("#define LOONGSON3A\n");
    printf("#define L1_DATA_SIZE 65536\n");
    printf("#define L1_DATA_LINESIZE 32\n");
    printf("#define L2_SIZE 512488\n");
    printf("#define L2_LINESIZE 32\n");
    printf("#define DTB_DEFAULT_ENTRIES 64\n");
    printf("#define DTB_SIZE 4096\n");
    printf("#define L2_ASSOCIATIVE 4\n");
    break;
  case CPU_LOONGSON3B:
    printf("#define LOONGSON3B\n");
    printf("#define L1_DATA_SIZE 65536\n");
    printf("#define L1_DATA_LINESIZE 32\n");
    printf("#define L2_SIZE 512488\n");
    printf("#define L2_LINESIZE 32\n");
    printf("#define DTB_DEFAULT_ENTRIES 64\n");
    printf("#define DTB_SIZE 4096\n");
    printf("#define L2_ASSOCIATIVE 4\n");
    break;
  case CPU_I6400:
    printf("#define I6400\n");
    printf("#define L1_DATA_SIZE 65536\n");
    printf("#define L1_DATA_LINESIZE 32\n");
    printf("#define L2_SIZE 1048576\n");
    printf("#define L2_LINESIZE 32\n");
    printf("#define DTB_DEFAULT_ENTRIES 64\n");
    printf("#define DTB_SIZE 4096\n");
    printf("#define L2_ASSOCIATIVE 8\n");
    break;
  case CPU_P6600:
    printf("#define P6600\n");
    printf("#define L1_DATA_SIZE 65536\n");
    printf("#define L1_DATA_LINESIZE 32\n");
    printf("#define L2_SIZE 1048576\n");
    printf("#define L2_LINESIZE 32\n");
    printf("#define DTB_DEFAULT_ENTRIES 64\n");
    printf("#define DTB_SIZE 4096\n");
    printf("#define L2_ASSOCIATIVE 8\n");
    break;
  default:
    printf("#define SICORTEX\n");
    printf("#define L1_DATA_SIZE 32768\n");
    printf("#define L1_DATA_LINESIZE 32\n");
    printf("#define L2_SIZE 512488\n");
    printf("#define L2_LINESIZE 32\n");
    printf("#define DTB_DEFAULT_ENTRIES 32\n");
    printf("#define DTB_SIZE 4096\n");
    printf("#define L2_ASSOCIATIVE 8\n");
    break;
  }
}
/* Write the library-name suffix of the detected core to stdout, followed
 * by a newline.
 *
 * detect() is called once and its result reused; every call re-reads
 * /proc/cpuinfo.  Any value without its own case (including CPU_SICORTEX
 * and CPU_UNKNOWN) prints the generic "mips64". */
void get_libname(void){
  switch (detect()){
  case CPU_LOONGSON3A:
    printf("loongson3a\n");
    break;
  case CPU_LOONGSON3B:
    printf("loongson3b\n");
    break;
  case CPU_I6400:
    printf("i6400\n");
    break;
  case CPU_P6600:
    printf("p6600\n");
    break;
  default:
    printf("mips64\n");
    break;
  }
}

View File

@ -1172,6 +1172,8 @@ int get_cpuname(void){
#endif
else
return CPUTYPE_NEHALEM;
case 12:
// Braswell
case 13:
// Avoton
return CPUTYPE_NEHALEM;
@ -1678,6 +1680,8 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 12:
// Braswell
case 13:
// Avoton
return CORE_NEHALEM;

View File

@ -110,7 +110,7 @@ ARCH_MIPS64
#endif
#if defined(__mips32) || defined(__mips)
ARCH_MIPS32
ARCH_MIPS
#endif
#ifdef __alpha

View File

@ -1,4 +1,4 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
enable_language(Fortran)

View File

@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat1
OMP_NUM_THREADS=2 ./xdcblat1
@ -53,8 +54,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat1
OPENBLAS_NUM_THREADS=2 ./xzcblat1
endif
endif
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat2 < sin2
OMP_NUM_THREADS=2 ./xdcblat2 < din2
@ -66,8 +69,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
endif
endif
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat3 < sin3
OMP_NUM_THREADS=2 ./xdcblat3 < din3
@ -88,6 +93,7 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
endif

View File

@ -1,5 +1,5 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
# sources that need to be compiled twice, once with no flags and once with LOWER
set(UL_SOURCES

View File

@ -1,4 +1,4 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa

View File

@ -1,4 +1,4 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
if (${CORE} STREQUAL "PPC440")
set(MEMORY memory_qalloc.c)

View File

@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Avoton
if (model == 13) {
//Intel Braswell / Avoton
if (model == 12 || model == 13) {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
for ( i=1 ; i <= 21; i++)
for ( i=1 ; i <= 22; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{

View File

@ -361,6 +361,9 @@ static void numa_mapping(void) {
unsigned long work, bit;
int count = 0;
int bitmask_idx = 0;
int current_cpu;
int current_node = 0;
int cpu_count = 0;
for (node = 0; node < common -> num_nodes; node ++) {
core = 0;
@ -382,33 +385,84 @@ static void numa_mapping(void) {
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif
h = 1;
current_cpu = sched_getcpu();
for (cpu = 0; cpu < count; cpu++) {
if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
current_node = READ_NODE(common -> cpu_info[cpu]);
break;
}
}
for (i = 0; i < MAX_BITMASK_LEN; i++)
cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
while (h < count) h = 2 * h + 1;
/*
* If all the processes can be accommodated in the
* current node itself, then bind to cores
* from the current node only
*/
if (numprocs <= cpu_count) {
/*
* First sort all the cores in order from the current node.
* Then take remaining nodes one by one in order,
* and sort their cores in order.
*/
for (i = 0; i < count; i++) {
for (j = 0; j < count - 1; j++) {
int node_1, node_2;
int core_1, core_2;
int swap = 0;
while (h > 1) {
h /= 2;
for (i = h; i < count; i++) {
work = common -> cpu_info[i];
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
j = i - h;
while (work < common -> cpu_info[j]) {
common -> cpu_info[j + h] = common -> cpu_info[j];
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
j -= h;
if (j < 0) break;
}
common -> cpu_info[j + h] = work;
if (bit) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
node_1 = READ_NODE(common -> cpu_info[j]);
node_2 = READ_NODE(common -> cpu_info[j + 1]);
core_1 = READ_CORE(common -> cpu_info[j]);
core_2 = READ_CORE(common -> cpu_info[j + 1]);
if (node_1 == node_2) {
if (core_1 > core_2)
swap = 1;
} else {
if ((node_2 == current_node) ||
((node_1 != current_node) && (node_1 > node_2)))
swap = 1;
}
if (swap) {
unsigned long temp;
temp = common->cpu_info[j];
common->cpu_info[j] = common->cpu_info[j + 1];
common->cpu_info[j + 1] = temp;
}
}
}
} else {
h = 1;
while (h < count) h = 2 * h + 1;
while (h > 1) {
h /= 2;
for (i = h; i < count; i++) {
work = common -> cpu_info[i];
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
j = i - h;
while (work < common -> cpu_info[j]) {
common -> cpu_info[j + h] = common -> cpu_info[j];
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
j -= h;
if (j < 0) break;
}
common -> cpu_info[j + h] = work;
if (bit) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
}
}
}
@ -416,7 +470,10 @@ static void numa_mapping(void) {
fprintf(stderr, "\nSorting ...\n\n");
for (cpu = 0; cpu < count; cpu++)
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
READ_CPU(common -> cpu_info[cpu]),
READ_CORE(common -> cpu_info[cpu]),
READ_NODE(common -> cpu_info[cpu]));
#endif
}

View File

@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
int size = 16;
#else
int size = get_L2_size();

View File

@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c

View File

@ -114,7 +114,7 @@ if ($compiler eq "") {
$openmp = "-mp";
}
if ($data =~ /IBM/) {
if ($data =~ /IBM XL/) {
$vendor = IBM;
$openmp = "-openmp";
}
@ -223,7 +223,12 @@ if (!$?) {
}
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
} else {
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
$binary = "" if ($?);
}

View File

@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */
@ -699,6 +702,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "I6400"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DI6400 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "i6400"
#define CORENAME "I6400"
#else
#endif
#ifdef FORCE_P6600
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "P6600"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DP6600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "p6600"
#define CORENAME "P6600"
#else
#endif
#ifdef FORCE_P5600
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "P5600"
#define SUBDIRNAME "mips"
#define ARCHCONFIG "-DP5600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "p5600"
#define CORENAME "P5600"
#else
#endif
#ifdef FORCE_ITANIUM2
#define FORCE
#define ARCHITECTURE "IA64"
@ -888,7 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __mips__
#ifdef __mips64
#include "cpuid_mips64.c"
#else
#include "cpuid_mips.c"
#endif
#define OPENBLAS_SUPPORTED
#endif

View File

@ -1,5 +1,5 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
set(BLAS1_SOURCES

File diff suppressed because it is too large Load Diff

View File

@ -42,6 +42,10 @@
#include "functable.h"
#endif
// Disable multi-threading as it does not show any performance
// benefits. Keep the multi-threading code for the record.
#undef SMP
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){

View File

@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
{
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
// It seems to be required for some K8 or Barcelona CPU
buffer_size += 8;
if(incx != 1)
buffer_size += n * 2;
}

View File

@ -1,6 +1,6 @@
include_directories(${CMAKE_SOURCE_DIR})
include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake")
include_directories(${PROJECT_SOURCE_DIR})
include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake")
# Makefile

View File

@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64)
USE_GEMM3M = 1
endif
ifeq ($(ARCH), MIPS)
USE_GEMM3M = 1
endif
ifeq ($(ARCH), arm)
USE_TRMM = 1
endif

View File

@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{
BLASLONG i=0,j=0;
if ( (n <= 0) || (inc_x <= 0))
return(0);
while(j < n)
{

View File

@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
BLASLONG ip = 0;
FLOAT temp;
if ( (n <= 0) || (inc_x <= 0))
return(0);
inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{

File diff suppressed because it is too large Load Diff

View File

@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
ldr d0, [X], #8
str d0, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
st1 {v0.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
st1 {v2.4s}, [Y], #16
st1 {v3.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
ldr q2, [X], #16
str q2, [Y], #16
ldr q3, [X], #16
str q3, [Y], #16
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0
@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0
@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0
@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0
@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x4
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #32
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow2, pCRow2, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
ld1 {v12.2d, v13.2d}, [pCRow3]
fmla v12.2d, v28.2d, alphaV0
fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
st1 {v12.2d, v13.2d}, [pCRow3]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE2x4
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #16
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #16
ld1 {v8.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow2, pCRow2, #16
ld1 {v12.2d}, [pCRow1]
ld1 {v12.2d}, [pCRow3]
fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
st1 {v12.2d}, [pCRow3]
add pCRow0, pCRow0, #16
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, #16
.endm
/******************************************************************************/
@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
ld1 {v8.d}[1], [pCRow1]
@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #8
ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1]
ld1 {v12.d}[1], [pCRow3]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
st1 {v12.d}[1], [pCRow3]
add pCRow0, pCRow0, #8
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow2, pCRow2, #8
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, #8
.endm
/******************************************************************************/
@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #64
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0
@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #64
.endm
/******************************************************************************/
@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x2
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #32
.endm
/******************************************************************************/
@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE2x2
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1 , pCRow0, LDC
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #16
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #16
.endm
/******************************************************************************/
@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
ld1 {v8.d}[1], [pCRow1]
@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, #8
.endm
/******************************************************************************/
@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
fmov alpha0, alpha
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0
@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #64
.endm
@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x1
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #32
.endm
@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE2x1
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #16
.endm
@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
.endm
@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
.align 5
dgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20:
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
.align 5
dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40:
dgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100:
dgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20:
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
.align 5
dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
subs counterL, counterL, #1
@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20:
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
.align 5
dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
subs counterL, counterL, #1
@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pC,pC,LDC, lsl #1
add pC, pCRow1, LDC
mov pA, origPA // pA = A
@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
.align 5
dgemm_kernel_L2_M8_20:
INIT8x2
@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
.align 5
.align 5
dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
.align 5
.align 5
dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
subs counterL, counterL, #1
@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20:
dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20:
dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
.align 5
dgemm_kernel_L1_M8_20:
INIT8x1
@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
.align 5
.align 5
dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
.align 5
.align 5
dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
subs counterL, counterL, #1
@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20:
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
KERNEL1x1_SUB

View File

@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define temp x16
#define tempOffset x17
#define tempK x18
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define alpha0 d10
#define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0]
#define alpha2 d14
#define alphaV2 v14.d[0]
#define alpha3 d15
#define alphaV3 v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save ALPHA2
//v15 must save ALPHA3
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.d[0]
fmul v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmul v17.2d, v1.2d, v8.d[0]
fmul v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmul v24.2d, v0.2d, v10.d[0]
fmul v28.2d, v0.2d, v11.d[0]
ldp q4, q5, [pA], #32
fmul v25.2d, v1.2d, v10.d[0]
fmul v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmul v18.2d, v2.2d, v8.d[0]
fmul v22.2d, v2.2d, v9.d[0]
ldp d14, d15, [pB], #16
fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
fmul v20.2d, v0.2d, v8.d[1]
fmul v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.d[1]
fmul v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v24.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v9.d[0]
fmul v26.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v9.d[0]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
fmul v28.2d, v0.2d, v9.d[1]
fmul v29.2d, v1.2d, v9.d[1]
fmul v30.2d, v2.2d, v9.d[1]
fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v21.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB], #16
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
ldp q6, q7, [pA], #32
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #512]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
ldp q0, q1, [pA], #32
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
ldp d8, d9, [pB], #16
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
ldp d10, d11, [pB], #16
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
ldp q2, q3, [pA], #32
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pB, #512]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
add pCRow1, pCRow0, LDC
fmov alpha0, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v1.2d, v17.2d, alphaV0
stp q0, q1, [pCRow0]
add pCRow2, pCRow1, LDC
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v21.2d, alphaV0
stp q4, q5, [pCRow1]
add pCRow1, pCRow2, LDC
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.2d, v24.2d, alphaV0
fmul v1.2d, v25.2d, alphaV1
fmul v2.2d, v26.2d, alphaV2
fmul v3.2d, v27.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
fmul v1.2d, v25.2d, alphaV0
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v2.2d, v26.2d, alphaV0
fmul v3.2d, v27.2d, alphaV0
stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0
fmul v5.2d, v29.2d, alphaV1
fmul v6.2d, v30.2d, alphaV2
fmul v7.2d, v31.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v29.2d, alphaV0
stp q4, q5, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v6.2d, v30.2d, alphaV0
fmul v7.2d, v31.2d, alphaV0
stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
fmul v9.2d, v25.2d, alphaV1
fmul v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV2
fmul v13.2d, v29.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
fmul v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV2
fmul v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
fmul v5.2d, v21.2d, alphaV0
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1 , pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
fmov alpha0, alpha
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0, alpha
fmul d8, d16, alpha0
str d8, [pCRow0]
@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha0, d0
fmov alpha1, d0
fmov alpha2, d0
fmov alpha3, d0
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
.align 5
dtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
// counterL counts groups of eight K-iterations; the unrolled sequences that
// follow issue eight KERNEL8x4_* steps per group.
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // at least two groups of 8 (K >= 16)? else use the short path
blt dtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
.align 5
.align 5
dtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
.align 5
dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
.align 5
dtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
dtrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
.align 5
dtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100:
SAVE8x4
@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1

View File

@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SHZ 3
#endif
#define A_PRE_SIZE 768
#define Y_PRE_SIZE 768
/******************************************************************************/
.macro SAVE_REGS
@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32
ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32
ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32
ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32
ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif

View File

@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define X_PREFETCH_SIZE 768
#define A_PREFETCH_SIZE 768
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v6.4s, v10.4s
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v7.4s, v11.4s
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
fmla v4.4s, v8.4s, v12.4s
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v14.4s, v18.4s
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif

View File

@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fabs MAXF, MAXF
.endm
// KERNEL_F8: consume one group of eight contiguous elements from X.
// Takes |x| of all eight, reduces them to a single scalar maximum in TMPF,
// then compares MAXF with TMPF and selects under COND:
//   COND true  -> MAXF and INDEX are kept
//   COND false -> MAXF = TMPF, INDEX = Z
// Z is then advanced by 8. Note that INDEX receives the group counter Z, not
// the position of the winning lane inside the group.
// MAXF, TMPF, COND, INDEX, Z and X are defined outside this macro.
// Clobbers v2-v3 (single precision) or v2-v5 (double precision).
.macro KERNEL_F8
#if !defined(DOUBLE)
// 32 bytes = eight single-precision values; X is post-incremented by 32.
ldp q2, q3, [X], #32
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
// Lane-wise max of the two halves, then horizontal max across the vector.
fmax v2.4s, v2.4s, v3.4s
fmaxv TMPF, v2.4s
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#else
// 2 x 32 bytes = eight double-precision values; X advances by 64 in total.
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
// Tree reduction: four vectors -> two -> one, then pairwise max to a scalar.
fmax v2.2d, v2.2d, v3.2d
fmax v4.2d, v4.2d, v5.2d
fmax v2.2d, v2.2d, v4.2d
fmaxp TMPF, v2.2d
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#endif
// Prefetch 1024 bytes beyond the already-advanced X pointer.
PRFM PLDL1KEEP, [X, #1024]
.endm
// KERNEL_F8_FINALIZE: refine the group-level INDEX left by KERNEL_F8 into an
// element-level index.
// Reloads the eight elements starting at x7 + (INDEX - 1) * element_size,
// takes their absolute values, and compares each one against MAXF. The
// candidate index in x6 starts at INDEX + 7 and counts down to INDEX + 0;
// every lane that compares equal overwrites INDEX, so the lowest matching
// offset is the one that remains. If no lane compares equal, INDEX is
// unchanged.
// x7 is used as the base address (the unit-stride driver copies X into x7
// before the loop) and is modified here. Clobbers x6, x7 and v2-v7.
.macro KERNEL_F8_FINALIZE
sub x6, INDEX, #1
#if !defined(DOUBLE)
// Byte offset = (INDEX - 1) * 4; one ldp fetches all eight floats.
lsl x6, x6, #2
add x7, x7, x6
ldp q2, q3, [x7]
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
// Upper half (elements 4..7, held in v3): spread the lanes into s4..s7.
ins v4.s[0], v3.s[0]
ins v5.s[0], v3.s[1]
ins v6.s[0], v3.s[2]
ins v7.s[0], v3.s[3]
// Element 7 is tested first with candidate index INDEX + 7.
add x6, INDEX, #7
fcmp MAXF, s7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
// Lower half (elements 0..3, held in v2): same countdown continues in x6.
ins v4.s[0], v2.s[0]
ins v5.s[0], v2.s[1]
ins v6.s[0], v2.s[2]
ins v7.s[0], v2.s[3]
sub x6, x6, #1
fcmp MAXF, s7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
#else
// Double precision: point x7 at element 4 first, byte offset
// (INDEX - 1 + 4) * 8, and load elements 4..7.
add x6, x6, #4
lsl x6, x6, #3
add x7, x7, x6
ldp q2, q3, [x7]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
ins v4.d[0], v2.d[0]
ins v5.d[0], v2.d[1]
ins v6.d[0], v3.d[0]
ins v7.d[0], v3.d[1]
// Element 7 is tested first with candidate index INDEX + 7.
add x6, INDEX, #7
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d4
csel INDEX, x6, INDEX, eq
// Step back 32 bytes (four doubles) to load elements 0..3.
sub x7, x7, #32
ldp q2, q3, [x7]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
ins v4.d[0], v2.d[0]
ins v5.d[0], v2.d[1]
ins v6.d[0], v3.d[0]
ins v7.d[0], v3.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d4
csel INDEX, x6, INDEX, eq
#endif
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
// Unit-stride fast path: any INC_X other than 1 goes to the strided loop.
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
// Save the starting X pointer; KERNEL_F8_FINALIZE addresses elements
// relative to x7.
mov x7, X
iamax_kernel_F_BEGIN:
INIT_S
// N -= 1 and finish if nothing remains.
// NOTE(review): presumably INIT_S has already consumed the first element;
// confirm against the INIT_S macro.
subs N, N, #1
ble iamax_kernel_L999
// I = N / 8: number of full eight-element groups.
asr I, N, #3
cmp I, xzr
beq iamax_kernel_F1
// Z is raised by one for the duration of the group loop and restored after
// KERNEL_F8_FINALIZE.
add Z, Z, #1
iamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
// I = N % 8: leftover elements, handled one at a time by KERNEL_S1.
ands I, N, #7
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1

View File

@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
.endm
// KERNEL_F8: consume one group of eight contiguous complex elements from X.
// For each element the two adjacent components are made absolute and summed
// with a pairwise add (|a| + |b|). The eight sums are reduced to a single
// scalar maximum in TMPF, then MAXF is compared with TMPF and selected under
// COND:
//   COND true  -> MAXF and INDEX are kept
//   COND false -> MAXF = TMPF, INDEX = Z
// Z is then advanced by 8. INDEX receives the group counter Z, not the
// position of the winning element inside the group.
// MAXF, TMPF, COND, INDEX, Z and X are defined outside this macro.
// Clobbers v2-v5 (single precision) or v2-v5 and v16-v19 (double precision).
.macro KERNEL_F8
#if !defined(DOUBLE)
// 64 bytes = eight (float, float) pairs; X advances by 64 in total.
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
fabs v5.4s, v5.4s
// Pairwise add collapses each pair: v2 = sums for elements 0..3,
// v3 = sums for elements 4..7.
faddp v2.4s, v2.4s, v3.4s
faddp v3.4s, v4.4s, v5.4s
fmax v2.4s, v2.4s, v3.4s
fmaxv TMPF, v2.4s
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#else
// 128 bytes = eight (double, double) pairs; X advances by 128 in total.
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
ldp q16, q17, [X], #32
ldp q18, q19, [X], #32
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fabs v16.2d, v16.2d
fabs v17.2d, v17.2d
fabs v18.2d, v18.2d
fabs v19.2d, v19.2d
// Pairwise add: v2 = sums for elements 0,1; v3 = 2,3; v4 = 4,5; v5 = 6,7.
faddp v2.2d, v2.2d, v3.2d
faddp v3.2d, v4.2d, v5.2d
faddp v4.2d, v16.2d, v17.2d
faddp v5.2d, v18.2d, v19.2d
// Tree reduction of the four sum vectors down to one scalar.
fmax v2.2d, v2.2d, v3.2d
fmax v4.2d, v4.2d, v5.2d
fmax v2.2d, v2.2d, v4.2d
fmaxp TMPF, v2.2d
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#endif
// Prefetch 1024 bytes beyond the already-advanced X pointer.
PRFM PLDL1KEEP, [X, #1024]
.endm
// KERNEL_F8_FINALIZE: refine the group-level INDEX left by KERNEL_F8 into an
// element-level index.
// Reloads the eight complex elements starting at
// x7 + (INDEX - 1) * element_size, recomputes |a| + |b| for each, and
// compares every sum against MAXF. The candidate index in x6 starts at
// INDEX + 7 and counts down to INDEX + 0; every element that compares equal
// overwrites INDEX, so the lowest matching offset is the one that remains.
// If none compares equal, INDEX is unchanged.
// x7 is used as the base address (the unit-stride driver copies X into x7
// before the loop) and is modified here. Clobbers x6, x7, v2-v5, plus
// v7 and v16-v19 in the double-precision variant.
.macro KERNEL_F8_FINALIZE
sub x6, INDEX, #1
#if !defined(DOUBLE)
// Byte offset = (INDEX - 1) * 8 (two floats per element).
lsl x6, x6, #3
add x7, x7, x6
ldp q2, q3, [x7]
ldp q4, q5, [x7, #32]
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
fabs v5.4s, v5.4s
// v2 = sums for elements 0..3, v3 = sums for elements 4..7.
faddp v2.4s, v2.4s, v3.4s
faddp v3.4s, v4.4s, v5.4s
// Each sum is moved into lane 0 of v4 so it can be compared as scalar s4.
// Element 7 is tested first with candidate index INDEX + 7.
ins v4.s[0], v3.s[3]
add x6, INDEX, #7
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[2]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[1]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[0]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
// Elements 3..0 come from v2.
ins v4.s[0], v2.s[3]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[2]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[1]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[0]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
#else
// Byte offset = (INDEX - 1) * 16 (two doubles per element).
lsl x6, x6, #4
add x7, x7, x6
ldp q2, q3, [x7]
ldp q4, q5, [x7, #32]
ldp q16, q17, [x7, #64]
ldp q18, q19, [x7, #96]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fabs v16.2d, v16.2d
fabs v17.2d, v17.2d
fabs v18.2d, v18.2d
fabs v19.2d, v19.2d
// v2 = sums for elements 0,1; v3 = 2,3; v4 = 4,5; v5 = 6,7.
faddp v2.2d, v2.2d, v3.2d
faddp v3.2d, v4.2d, v5.2d
faddp v4.2d, v16.2d, v17.2d
faddp v5.2d, v18.2d, v19.2d
// Each sum is moved into lane 0 of v7 so it can be compared as scalar d7.
// Element 7 is tested first with candidate index INDEX + 7.
ins v7.d[0], v5.d[1]
add x6, INDEX, #7
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v5.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v4.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v4.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v3.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v3.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v2.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v2.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
// Unit-stride fast path: any INC_X other than 1 goes to the strided loop.
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
// Save the starting X pointer; KERNEL_F8_FINALIZE addresses elements
// relative to x7.
mov x7, X
iamax_kernel_F_BEGIN:
INIT_S
// N -= 1 and finish if nothing remains.
// NOTE(review): presumably INIT_S has already consumed the first element;
// confirm against the INIT_S macro.
subs N, N, #1
ble iamax_kernel_L999
// I = N / 8: number of full eight-element groups.
asr I, N, #3
cmp I, xzr
ble iamax_kernel_F1
// Z is raised by one for the duration of the group loop and restored after
// KERNEL_F8_FINALIZE.
add Z, Z, #1
iamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
// I = N % 8: leftover elements, handled one at a time by KERNEL_S1.
ands I, N, #7
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save
// 20 must save
// 21 must save
@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
// Read-modify-write of C for the 4x4 kernel: every ld2 ... st2 group below
// loads two interleaved element pairs, adds the alpha-scaled accumulators
// (v16..v31, taken in pairs) and stores the result back to the same address.
// Within each group the multiply-accumulates have the form
//   first register of the pair  += acc_a * alpha_R - acc_b * alpha_I
//   second register of the pair += acc_a * alpha_I + acc_b * alpha_R
// i.e. presumably the real and imaginary halves of C += alpha * acc.
// NOTE(review): this text looks like a diff rendered without +/- markers, so
// two variants of several statements sit side by side (alpha_save_*/alphaV1_*
// with pCRow1/pCRow2 addressing, and alphaR/alphaI with pCRow0..pCRow3
// addressing). Confirm which variant is current before relying on a line.
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow1]
ld2 {v0.2d, v1.2d}, [pCRow0]
// Accumulator pairs v16/v17 and v18/v19.
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
ld2 {v2.2d, v3.2d}, [pCRow0]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
ld2 {v4.2d, v5.2d}, [pCRow1]
// Accumulator pairs v20/v21 and v22/v23.
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
add pCRow1, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow1]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
ld2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow2]
// Accumulator pairs v24/v25 and v26/v27.
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmla v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmla v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld2 {v4.2d, v5.2d}, [pCRow1]
ld2 {v4.2d, v5.2d}, [pCRow3]
// Accumulator pairs v28/v29 and v30/v31.
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
ld2 {v6.2d, v7.2d}, [pCRow3]
fmla v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmla v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmla d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmla d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmla d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmla d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
.align 5
// K loop driver for the 4x4 kernel: resets pB to origPB, derives the trip
// count from origK and dispatches to the unrolled KERNEL4x4_* sequences.
// NOTE(review): this text looks like a diff rendered without +/- markers, so
// the shift-by-1 and shift-by-3 variants of the counter setup (and the two
// commented KERNEL4x4_I / KERNEL4x4_M2 lines) appear together; confirm which
// variant is current.
zgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
// counterL = origK >> 3; the uncommented macro groups below each contain
// eight KERNEL4x4_* invocations.
asr counterL , origK, #3
// Fewer than two groups: take the short path at zgemm_kernel_L4_M4_32.
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
// Two groups are accounted for outside the loop: the one just issued and the
// closing group at zgemm_kernel_L4_M4_22a. Skip the loop if none remain.
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
.align 5
.align 5
// Steady-state loop: one group of M1/M2 pairs per iteration.
zgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
.align 5
// Closing group: same shape as the loop body but ends with KERNEL4x4_E.
zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
.align 5
// Short path, reached when counterL < 2: if bit 0 of counterL is clear the
// branch to zgemm_kernel_L4_M4_40 is taken, otherwise a single
// KERNEL4x4_I ... KERNEL4x4_E group runs.
zgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
zgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4

View File

@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */
#define A_PRE_SIZE 768
#define Y_PRE_SIZE 768
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif
@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT
/********** INIT FOR F4 LOOP **********/
fmov ALPHA_R_COPY, ALPHA_R
fmov ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
ins v7.d[1], v7.d[0]
ins v8.d[1], v8.d[0]
#else
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
#endif
/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
eor v2.16b, v2.16b, v2.16b
@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro INIT_LOOP
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
ins v10.s[0], v9.s[1]
ins v9.s[1], v9.s[0] // [R(X), R(X)]
ins v10.s[1], v10.s[0] // [I(X), I(X)]
ins v9.d[1], v9.d[0]
ins v10.d[1], v10.d[0]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]
/********** INIT_LOOP FOR F4 LOOP **********/
// Broadcast the four single-precision coefficients consumed by the F4 kernel:
//   v21 multiplies the first ld2 register of A into the first Y register,
//   v23 multiplies the second A register into the first Y register,
//   v22 multiplies the second A register into the second Y register,
//   v24 multiplies the first A register into the second Y register.
// s2 holds R[TEMP] and s3 holds I[TEMP]; negated copies are built in v25 as
// (0 - value) via eor + fsub.
#if !defined(CONJ)
#if !defined(XCONJ)
dup v21.4s, v2.s[0] // R[TEMP]
dup v22.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v23.4s, v25.s[0] // -I[TEMP]
dup v24.4s, v3.s[0] // I[TEMP]
#else
dup v21.4s, v2.s[0] // R[TEMP]
dup v22.4s, v2.s[0] // R[TEMP]
dup v23.4s, v3.s[0] // I[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v24.4s, v25.s[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
dup v21.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s2
dup v22.4s, v25.s[0] // -R[TEMP] (v25 = 0 - s2)
dup v23.4s, v3.s[0] // I[TEMP]
dup v24.4s, v3.s[0] // I[TEMP]
#else
dup v21.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s2
dup v22.4s, v25.s[0] // -R[TEMP] (v25 = 0 - s2)
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v23.4s, v25.s[0] // -I[TEMP] (v25 = 0 - s3)
dup v24.4s, v25.s[0] // -I[TEMP]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // CONJ
#else // DOUBLE
/********** INIT_LOOP FOR F4 LOOP **********/
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
ins v10.d[0], v9.d[1]
ins v9.d[1], v9.d[0] // [R(X), R(X)]
ins v10.d[1], v10.d[0] // [I(X), I(X)]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)
/****** INIT_LOOP FOR F4 LOOP ******/
// Broadcast the four double-precision coefficients consumed by the F4 kernel:
//   v21 multiplies the first ld2 register of A into the first Y register,
//   v23 multiplies the second A register into the first Y register,
//   v22 multiplies the second A register into the second Y register,
//   v24 multiplies the first A register into the second Y register.
// d2 holds R[TEMP] and d3 holds I[TEMP]; negated copies are built in v25 as
// (0 - value) via eor + fsub.
#if !defined(CONJ)
#if !defined(XCONJ)
dup v21.2d, v2.d[0] // R[TEMP]
dup v22.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v23.2d, v25.d[0] // -I[TEMP]
dup v24.2d, v3.d[0] // I[TEMP]
#else
dup v21.2d, v2.d[0] // R[TEMP]
dup v22.2d, v2.d[0] // R[TEMP]
dup v23.2d, v3.d[0] // I[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v24.2d, v25.d[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
dup v21.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d2
dup v22.2d, v25.d[0] // -R[TEMP] (v25 = 0 - d2)
dup v23.2d, v3.d[0] // I[TEMP]
dup v24.2d, v3.d[0] // I[TEMP]
#else
dup v21.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d2
dup v22.2d, v25.d[0] // -R[TEMP] (v25 = 0 - d2)
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v23.2d, v25.d[0] // -I[TEMP] (v25 = 0 - d3)
dup v24.2d, v25.d[0] // -I[TEMP]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v13.4s, v14.4s}, [A_PTR], #32
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v15.4s, v21.4s, v13.4s
fmla v15.4s, v23.4s, v14.4s
fmla v16.4s, v22.4s, v14.4s
fmla v16.4s, v24.4s, v13.4s
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
#else // DOUBLE
ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v15.2d, v21.2d, v13.2d
fmla v15.2d, v23.2d, v14.2d
fmla v16.2d, v22.2d, v14.2d
fmla v16.2d, v24.2d, v13.2d
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v19.2d, v21.2d, v17.2d
fmla v19.2d, v23.2d, v18.2d
fmla v20.2d, v22.2d, v18.2d
fmla v20.2d, v24.2d, v17.2d
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
#endif
@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
zgemv_n_kernel_F4:
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F4
subs I, I, #1
bne zgemv_n_kernel_F4

View File

@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define A_PRE_SIZE 768
#define X_PRE_SIZE 768
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]

View File

@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save temp
// 19 must save tempOffset
// 20 must save tempK
// 21 must save
// 15 pCRow3
// 16 pA
// 17 alphaR
// 18 must save alphaI
// 19 must save temp
// 20 must save tempOffset
// 21 must save tempK
// 22 must save
// 23 must save
// 24 must save
@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
add pCRow1, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
fmul v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmul v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmul v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
fmul v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmul v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmul d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmul d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmul d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmul d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble ztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
.align 5
ztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
.align 5
.align 5
ztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
.align 5
ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
.align 5
ztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
ztrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
.align 5
ztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100:
SAVE4x4
@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20

46
kernel/mips/KERNEL Normal file
View File

@ -0,0 +1,46 @@
# Fallback kernel selections: each variable below is assigned only when the
# `ifndef` test finds it undefined (GNU make also treats an empty value as
# undefined), so an earlier definition takes precedence.

# Euclidean-norm kernels. Note these two source names carry no directory
# prefix, unlike the ../generic/ entries further down.
ifndef SNRM2KERNEL
SNRM2KERNEL = nrm2.c
endif
ifndef DNRM2KERNEL
DNRM2KERNEL = nrm2.c
endif
ifndef CNRM2KERNEL
CNRM2KERNEL = znrm2.c
endif
ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.c
endif
# cabs helpers: all three precisions share the generic C source.
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif
ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif
ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif
# lsame helper.
ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif
# GEMM beta-scaling helpers: real types use gemm_beta.c, complex types use
# zgemm_beta.c.
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

221
kernel/mips/KERNEL.P5600 Normal file
View File

@ -0,0 +1,221 @@
# Max/min kernel sources for the P5600 target. These are assigned
# unconditionally (no HAVE_MSA variants in this group).

# Absolute max/min value: real types share amax.c/amin.c, complex types share
# zamax.c/zamin.c.
SAMAXKERNEL = ../mips/amax.c
DAMAXKERNEL = ../mips/amax.c
CAMAXKERNEL = ../mips/zamax.c
ZAMAXKERNEL = ../mips/zamax.c
SAMINKERNEL = ../mips/amin.c
DAMINKERNEL = ../mips/amin.c
CAMINKERNEL = ../mips/zamin.c
ZAMINKERNEL = ../mips/zamin.c
# Signed max/min value (real types only).
SMAXKERNEL = ../mips/max.c
DMAXKERNEL = ../mips/max.c
SMINKERNEL = ../mips/min.c
DMINKERNEL = ../mips/min.c
# Index-returning variants of the absolute max/min kernels.
ISAMAXKERNEL = ../mips/iamax.c
IDAMAXKERNEL = ../mips/iamax.c
ICAMAXKERNEL = ../mips/izamax.c
IZAMAXKERNEL = ../mips/izamax.c
ISAMINKERNEL = ../mips/iamin.c
IDAMINKERNEL = ../mips/iamin.c
ICAMINKERNEL = ../mips/izamin.c
IZAMINKERNEL = ../mips/izamin.c
# Index-returning variants of the signed max/min kernels (real types only).
ISMAXKERNEL = ../mips/imax.c
IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c
# Absolute-sum (asum) kernels: the *_msa.c sources are selected when HAVE_MSA
# is defined, the plain C kernels otherwise.
ifdef HAVE_MSA
SASUMKERNEL = ../mips/sasum_msa.c
DASUMKERNEL = ../mips/dasum_msa.c
CASUMKERNEL = ../mips/casum_msa.c
ZASUMKERNEL = ../mips/zasum_msa.c
else
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
# The complex variants must not reuse asum.c: that kernel reads one scalar per
# element at stride inc_x and has no handling for interleaved re/im pairs.
# Use the complex source, matching the z*.c convention of every other complex
# kernel in this file.
# NOTE(review): assumes kernel/mips/zasum.c exists alongside zamax.c/zdot.c -
# confirm before merging.
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
endif
# Remaining level-1 kernel sources. Throughout, real types (S/D) share one
# source and complex types (C/Z) share the corresponding z*.c source.

# axpy
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
# copy
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
# dot: per-type *_msa.c sources when HAVE_MSA is defined, shared C sources
# otherwise.
ifdef HAVE_MSA
SDOTKERNEL = ../mips/sdot_msa.c
DDOTKERNEL = ../mips/ddot_msa.c
CDOTKERNEL = ../mips/cdot_msa.c
ZDOTKERNEL = ../mips/zdot_msa.c
else
SDOTKERNEL = ../mips/dot.c
DDOTKERNEL = ../mips/dot.c
CDOTKERNEL = ../mips/zdot.c
ZDOTKERNEL = ../mips/zdot.c
endif
# nrm2
SNRM2KERNEL = ../mips/nrm2.c
DNRM2KERNEL = ../mips/nrm2.c
CNRM2KERNEL = ../mips/znrm2.c
ZNRM2KERNEL = ../mips/znrm2.c
# rot
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
# scal
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
# swap
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
# GEMV "N" kernels: per-type *_msa.c sources when HAVE_MSA is defined;
# otherwise gemv_n.c for real types and zgemv_n.c for complex types.
ifdef HAVE_MSA
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
DGEMVNKERNEL = ../mips/dgemv_n_msa.c
CGEMVNKERNEL = ../mips/cgemv_n_msa.c
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
else
SGEMVNKERNEL = ../mips/gemv_n.c
DGEMVNKERNEL = ../mips/gemv_n.c
CGEMVNKERNEL = ../mips/zgemv_n.c
ZGEMVNKERNEL = ../mips/zgemv_n.c
endif
# GEMV "T" kernels: same selection scheme as the "N" kernels above.
ifdef HAVE_MSA
SGEMVTKERNEL = ../mips/sgemv_t_msa.c
DGEMVTKERNEL = ../mips/dgemv_t_msa.c
CGEMVTKERNEL = ../mips/cgemv_t_msa.c
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c
else
SGEMVTKERNEL = ../mips/gemv_t.c
DGEMVTKERNEL = ../mips/gemv_t.c
CGEMVTKERNEL = ../mips/zgemv_t.c
ZGEMVTKERNEL = ../mips/zgemv_t.c
endif
# GEMM kernels and their packing (copy) routines. With HAVE_MSA the *_msa.c
# sources are used; otherwise the generic 2x2 kernels with the generic
# 2-wide copy routines. Tile sizes quoted below are read off the file names.

# SGEMM: 8x8 MSA kernel; only the ON/OT copy routines are set in either branch.
ifdef HAVE_MSA
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
else
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
endif
# DGEMM: 8x4 MSA kernel. The MSA branch additionally sets the IN/IT copy
# routines (8-wide) next to the ON/OT ones (4-wide); the fallback branch sets
# only ON/OT.
ifdef HAVE_MSA
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c
DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c
DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c
DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
else
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
endif
# CGEMM: 8x4 MSA kernel with IN/IT (8-wide) and ON/OT (4-wide) copy routines;
# the fallback uses the generic complex 2x2 kernel.
ifdef HAVE_MSA
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
else
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
endif
# ZGEMM: 4x4 MSA kernel; only the ON/OT copy routines are set in either branch.
ifdef HAVE_MSA
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
else
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
endif
# Real TRSM kernels (LN/LT/RN/RT variants). With HAVE_MSA the *_msa.c sources
# are used; otherwise the generic C kernels. The tile sizes in the MSA file
# names (8x8 for S, 8x4 for D) match the GEMM kernels selected for the same
# type elsewhere in this file.
ifdef HAVE_MSA
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
else
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
ifdef HAVE_MSA
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
else
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
# Complex TRSM kernels (LN/LT/RN/RT variants). The generic C kernels are used
# whether or not HAVE_MSA is defined, so the assignments are unconditional.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

2
kernel/mips/Makefile Normal file
View File

@ -0,0 +1,2 @@
# Double-colon `clean` rule with no prerequisites and no recipe: it defines
# the target for this directory without removing anything.
clean ::

66
kernel/mips/amax.c Normal file
View File

@ -0,0 +1,66 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Return the largest absolute value among n elements of x, visiting
 * x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos;
    FLOAT best = 0.0;
    FLOAT cur;

    if (n <= 0 || inc_x <= 0) return(best);

    /* Seed with the first element, then scan the remaining n-1. */
    best = ABS(x[0]);

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
    {
        cur = ABS(x[pos]);
        if (cur > best) best = cur;
    }

    return(best);
}

66
kernel/mips/amin.c Normal file
View File

@ -0,0 +1,66 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Return the smallest absolute value among n elements of x, visiting
 * x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos;
    FLOAT best = 0.0;
    FLOAT cur;

    if (n <= 0 || inc_x <= 0) return(best);

    /* Seed with the first element, then scan the remaining n-1. */
    best = ABS(x[0]);

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
    {
        cur = ABS(x[pos]);
        if (cur < best) best = cur;
    }

    return(best);
}

57
kernel/mips/asum.c Normal file
View File

@ -0,0 +1,57 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Return the sum of absolute values of n elements of x, visiting
 * x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG pos;
    BLASLONG limit;

    if (n <= 0 || inc_x <= 0) return(total);

    /* One past the last flat index to visit. */
    limit = n * inc_x;

    for (pos = 0; pos < limit; pos += inc_x)
    {
        total += ABS(x[pos]);
    }

    return(total);
}

95
kernel/mips/axpby.c Normal file
View File

@ -0,0 +1,95 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * y := alpha * x + beta * y over n elements, stepping x by inc_x and y by
 * inc_y. A zero scalar selects a reduced form so the corresponding operand
 * is not read: with beta == 0 the old y is never loaded, with alpha == 0
 * x is never loaded. Always returns 0; does nothing when n < 0.
 */
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;

    if ( n < 0 ) return(0);

    if ( beta == 0.0 && alpha == 0.0 )
    {
        /* Both scalars zero: overwrite y with zeros. */
        for (k = 0; k < n; k++, ypos += inc_y)
        {
            y[ypos] = 0.0 ;
        }
    }
    else if ( beta == 0.0 )
    {
        /* y := alpha * x */
        for (k = 0; k < n; k++, xpos += inc_x, ypos += inc_y)
        {
            y[ypos] = alpha * x[xpos] ;
        }
    }
    else if ( alpha == 0.0 )
    {
        /* y := beta * y */
        for (k = 0; k < n; k++, ypos += inc_y)
        {
            y[ypos] = beta * y[ypos] ;
        }
    }
    else
    {
        /* General case. */
        for (k = 0; k < n; k++, xpos += inc_x, ypos += inc_y)
        {
            y[ypos] = alpha * x[xpos] + beta * y[ypos] ;
        }
    }

    return(0);
}

54
kernel/mips/axpy.c Normal file
View File

@ -0,0 +1,54 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * AXPY kernel: y[k * inc_y] += da * x[k * inc_x] for k = 0 .. n-1.
 *
 * dummy0, dummy1, dummy and dummy2 are not read by this implementation.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;

    /* a negative length or a zero scale factor leaves y untouched */
    if (n < 0 || da == 0.0) return(0);

    for (k = 0; k < n; k++)
    {
        y[ypos] += da * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }

    return(0);
}

338
kernel/mips/casum_msa.c Normal file
View File

@ -0,0 +1,338 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include <math.h>
#include "macros_msa.h"
/* Per-lane absolute value: masks off bit 31 of every 32-bit lane.
 * Relies on a v4i32 named `and_vec` (0x7FFFFFFF in each lane) being in scope
 * wherever the macro is used. */
#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))

/*
 * Sum of |re| + |im| over n complex elements (MSA kernel).
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) data
 *   inc_x  distance between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i, inc_x2, n_vec;
    FLOAT sumf = 0.0;
    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
    v4f32 zero_v = {0};
    v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};

    if (n <= 0 || inc_x <= 0) return (sumf);

    if (1 == inc_x)
    {
        /* Contiguous data: each 4-float vector carries two complex elements,
         * so all four lanes of the accumulators hold valid data. */
        if (n > 15)
        {
            /* the first 16 elements seed the accumulators directly */
            n -= 16;

            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_W(src0);
            sum_abs1 = AND_VEC_W(src1);
            sum_abs2 = AND_VEC_W(src2);
            sum_abs3 = AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        /* 16 elements (8 vectors) per iteration */
        for (i = (n >> 4); i--;)
        {
            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_W(src0);
            sum_abs1 += AND_VEC_W(src1);
            sum_abs2 += AND_VEC_W(src2);
            sum_abs3 += AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }

        /* even part of the remainder (2..14 elements) in whole vectors */
        switch (n & 14)
        {
            case 14:
                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                sum_abs2 += AND_VEC_W(src6);
                break;

            case 12:
                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                break;

            case 10:
                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                break;

            case 8:
                LD_SP4_INC(x, 4, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                break;

            case 6:
                LD_SP3_INC(x, 4, src0, src1, src2);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                break;

            case 4:
                LD_SP2_INC(x, 4, src0, src1);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                break;

            case 2:
                src0 = LD_SP(x); x += 4;

                sum_abs0 += AND_VEC_W(src0);
                break;

            default:
                break;
        }

        /* horizontal reduction over all four lanes */
        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        sumf = sum_abs0[0];
        sumf += sum_abs0[1];
        sumf += sum_abs0[2];
        sumf += sum_abs0[3];

        /* odd leftover element */
        if (n & 1)
        {
            sumf += fabsf(*(x + 0));
            sumf += fabsf(*(x + 1));
        }
    }
    else
    {
        inc_x2 = 2 * inc_x;

        /* Every strided element is fetched with a full 4-float vector load
         * although it owns only two floats; lanes 2 and 3 are ignored by the
         * final reduction. For all elements but the last, the two surplus
         * floats still lie inside the vector (inc_x >= 2 here). For the last
         * element they would be read from beyond the end of the vector, so it
         * is held back and added with scalar code below. */
        n_vec = n - 1;

        if (n_vec > 8)
        {
            /* the first 8 elements seed the accumulators directly */
            n_vec -= 8;

            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_W(src0);
            sum_abs1 = AND_VEC_W(src1);
            sum_abs2 = AND_VEC_W(src2);
            sum_abs3 = AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        /* 8 elements (one vector each) per iteration */
        for (i = (n_vec >> 3); i--;)
        {
            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_W(src0);
            sum_abs1 += AND_VEC_W(src1);
            sum_abs2 += AND_VEC_W(src2);
            sum_abs3 += AND_VEC_W(src3);
            sum_abs0 += AND_VEC_W(src4);
            sum_abs1 += AND_VEC_W(src5);
            sum_abs2 += AND_VEC_W(src6);
            sum_abs3 += AND_VEC_W(src7);
        }

        /* remaining 1..7 vector-loaded elements */
        switch (n_vec & 7)
        {
            case 7:
                LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                sum_abs2 += AND_VEC_W(src6);
                break;

            case 6:
                LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                sum_abs1 += AND_VEC_W(src5);
                break;

            case 5:
                LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                sum_abs0 += AND_VEC_W(src4);
                break;

            case 4:
                LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                sum_abs3 += AND_VEC_W(src3);
                break;

            case 3:
                LD_SP3_INC(x, inc_x2, src0, src1, src2);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                sum_abs2 += AND_VEC_W(src2);
                break;

            case 2:
                LD_SP2_INC(x, inc_x2, src0, src1);

                sum_abs0 += AND_VEC_W(src0);
                sum_abs1 += AND_VEC_W(src1);
                break;

            case 1:
                src0 = LD_SP(x); x += inc_x2;

                sum_abs0 += AND_VEC_W(src0);
                break;

            default:
                break;
        }

        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        /* only lanes 0 and 1 hold the element that was asked for */
        sumf = sum_abs0[0] + sum_abs0[1];

        /* x now points at the held-back last element: exactly two floats */
        sumf += fabsf(*(x + 0));
        sumf += fabsf(*(x + 1));
    }

    return (sumf);
}

361
kernel/mips/cdot_msa.c Normal file
View File

@ -0,0 +1,361 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Operators used by the scalar code paths; they differ between the
 * non-conjugated (!CONJ) and conjugated (CONJ) build of this file. */
#if !defined(CONJ)
#define OP2 +=
#define OP3 -
#define OP4 +
#else
#define OP2 -=
#define OP3 +
#define OP4 -
#endif

/* One vector multiply-accumulate step on register pair `k`
 * (vx<k>r/vx<k>i against vy<k>r/vy<k>i). OPR0 and OPR1 are pasted onto `=`
 * to form the compound assignment used for the i*i and i*r products. */
#define CDOT_MSA_STEP(k, OPR0, OPR1)         \
    dot0 += (vx##k##r * vy##k##r);           \
    dot0 OPR0## = (vx##k##i * vy##k##i);     \
    dot1 OPR1## = (vx##k##i * vy##k##r);     \
    dot1 += (vx##k##r * vy##k##i);

/* Kernels for 4, 8, 12 and 16 elements: 1 to 4 steps on pairs 0..3, in order. */
#define DOT4_KERNEL(OPR0, OPR1)   \
    CDOT_MSA_STEP(0, OPR0, OPR1)

#define DOT8_KERNEL(OPR0, OPR1)   \
    DOT4_KERNEL(OPR0, OPR1)       \
    CDOT_MSA_STEP(1, OPR0, OPR1)

#define DOT12_KERNEL(OPR0, OPR1)  \
    DOT8_KERNEL(OPR0, OPR1)       \
    CDOT_MSA_STEP(2, OPR0, OPR1)

#define DOT16_KERNEL(OPR0, OPR1)  \
    DOT12_KERNEL(OPR0, OPR1)      \
    CDOT_MSA_STEP(3, OPR0, OPR1)

/* return float, x,y float */
/* cdotc - CONJ */
/* cdotu - !CONJ */
/*
 * Complex dot product of two vectors of n elements stored as interleaved
 * (re, im) pairs. inc_x / inc_y are element strides. Returns 0 + 0i when
 * n < 1.
 */
#ifndef _MSC_VER
#include <complex.h>
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
    BLASLONG i = 0;
    BLASLONG step_x, step_y;
    FLOAT re_acc = 0.0;             /* scalar accumulator, real part      */
    FLOAT im_acc = 0.0;             /* scalar accumulator, imaginary part */
    FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
    FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
    v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
    v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
    v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
    v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
    v4f32 dot0 = {0, 0, 0, 0};      /* vector accumulator, real part      */
    v4f32 dot1 = {0, 0, 0, 0};      /* vector accumulator, imaginary part */
    openblas_complex_float result;

    __real__(result) = 0.0;
    __imag__(result) = 0.0;

    if ( n < 1 ) return(result);

    if ((1 == inc_x) && (1 == inc_y))
    {
        /* 16 elements per iteration: load, split into re/im vectors, accumulate */
        for (i = (n >> 4); i--;)
        {
            LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
            LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);

            PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
            PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
            PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
            PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);

            PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
            PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
            PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
            PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);

#if !defined(CONJ)
            DOT16_KERNEL(-, +);
#else
            DOT16_KERNEL(+, -);
#endif
        }

        /* 12, 8 or 4 further elements still go through the vector unit */
        switch (n & 12)
        {
            case 12:
                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
                LD_SP2_INC(x, 4, vx4, vx5);
                LD_SP2_INC(y, 4, vy4, vy5);

                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
                PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
                PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);

                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
                PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
                PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);

#if !defined(CONJ)
                DOT12_KERNEL(-, +);
#else
                DOT12_KERNEL(+, -);
#endif
                break;

            case 8:
                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);

                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
                PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);

                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
                PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);

#if !defined(CONJ)
                DOT8_KERNEL(-, +);
#else
                DOT8_KERNEL(+, -);
#endif
                break;

            case 4:
                LD_SP2_INC(x, 4, vx0, vx1);
                LD_SP2_INC(y, 4, vy0, vy1);

                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);

#if !defined(CONJ)
                DOT4_KERNEL(-, +);
#else
                DOT4_KERNEL(+, -);
#endif
                break;

            default:
                break;
        }

        /* last 0..3 elements, one at a time */
        for (i = (n & 3); i--;)
        {
            LD_GP2_INC(x, 1, x0, x1);
            LD_GP2_INC(y, 1, y0, y1);

            re_acc += ( x0 * y0 OP3 x1 * y1 );
            im_acc OP2 ( x1 * y0 OP4 x0 * y1 );
        }

        /* fold the vector accumulators into the scalar ones */
        re_acc += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
        im_acc += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
    }
    else
    {
        /* strided data: plain scalar code, four elements per iteration */
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;

        for (i = (n >> 2); i--;)
        {
            x0 = x[0];  x1 = x[1];  x += step_x;
            x2 = x[0];  x3 = x[1];  x += step_x;
            x4 = x[0];  x5 = x[1];  x += step_x;
            x6 = x[0];  x7 = x[1];  x += step_x;

            y0 = y[0];  y1 = y[1];  y += step_y;
            y2 = y[0];  y3 = y[1];  y += step_y;
            y4 = y[0];  y5 = y[1];  y += step_y;
            y6 = y[0];  y7 = y[1];  y += step_y;

            re_acc += ( x0 * y0 OP3 x1 * y1 );
            im_acc OP2 ( x1 * y0 OP4 x0 * y1 );
            re_acc += ( x2 * y2 OP3 x3 * y3 );
            im_acc OP2 ( x3 * y2 OP4 x2 * y3 );
            re_acc += ( x4 * y4 OP3 x5 * y5 );
            im_acc OP2 ( x5 * y4 OP4 x4 * y5 );
            re_acc += ( x6 * y6 OP3 x7 * y7 );
            im_acc OP2 ( x7 * y6 OP4 x6 * y7 );
        }

        /* last 0..3 elements */
        for (i = (n & 3); i--;)
        {
            x0 = x[0];  x1 = x[1];  x += step_x;
            y0 = y[0];  y1 = y[1];  y += step_y;

            re_acc += ( x0 * y0 OP3 x1 * y1 );
            im_acc OP2 ( x1 * y0 OP4 x0 * y1 );
        }
    }

    __real__(result) = re_acc;
    __imag__(result) = im_acc;

    return(result);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,195 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Interleaving copy of src into dst.
 *
 * src is read as n "lines" that are lda elements apart, each holding m
 * contiguous elements; an element is two FLOATs wide (lda is doubled to turn
 * it into a FLOAT offset). Lines are consumed in groups of 4, then 2, then 1.
 * Within a group the output holds, for every position along the lines, that
 * position's element from each line of the group (the scalar tails below show
 * this layout directly; the vector paths presumably produce the same order
 * through ILVRL_D2_SP -- confirm against macros_msa.h).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG pos_cnt, grp_cnt;
    FLOAT *next_line, *ln0, *ln1, *ln2, *ln3, *out;
    FLOAT t0, t1, t2, t3, t4, t5, t6, t7;
    v4f32 a0, b0, a1, b1, a2, b2, a3, b3;   /* a<k>/b<k>: 1st/2nd vector of line k */
    v4f32 top0, top1, bot0, bot1;

    next_line = src;
    out = dst;
    lda *= 2;

    /* ---- groups of four lines ---- */
    grp_cnt = n >> 2;
    while (grp_cnt--)
    {
        ln0 = next_line;
        ln1 = ln0 + lda;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        next_line += 4 * lda;

        /* four positions (two vectors per line) at a time */
        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            LD_SP2_INC(ln1, 4, a1, b1);
            LD_SP2_INC(ln2, 4, a2, b2);
            LD_SP2_INC(ln3, 4, a3, b3);

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ILVRL_D2_SP(a3, a2, top1, bot1);
            ST_SP4_INC(top0, top1, bot0, bot1, out, 4);

            ILVRL_D2_SP(b1, b0, top0, bot0);
            ILVRL_D2_SP(b3, b2, top1, bot1);
            ST_SP4_INC(top0, top1, bot0, bot1, out, 4);
        }

        /* two positions (one vector per line) */
        if (m & 2)
        {
            a0 = LD_SP(ln0);
            a1 = LD_SP(ln1);
            a2 = LD_SP(ln2);
            a3 = LD_SP(ln3);
            ln0 += 4;
            ln1 += 4;
            ln2 += 4;
            ln3 += 4;

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ILVRL_D2_SP(a3, a2, top1, bot1);
            ST_SP4_INC(top0, top1, bot0, bot1, out, 4);
        }

        /* one leftover position: read everything first, then write */
        if (m & 1)
        {
            t0 = ln0[0];  t1 = ln0[1];
            t2 = ln1[0];  t3 = ln1[1];
            t4 = ln2[0];  t5 = ln2[1];
            t6 = ln3[0];  t7 = ln3[1];

            out[0] = t0;  out[1] = t1;
            out[2] = t2;  out[3] = t3;
            out[4] = t4;  out[5] = t5;
            out[6] = t6;  out[7] = t7;
            out += 8;
        }
    }

    /* ---- one group of two lines ---- */
    if (n & 2)
    {
        ln0 = next_line;
        ln1 = ln0 + lda;
        next_line += 2 * lda;

        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            LD_SP2_INC(ln1, 4, a1, b1);

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ST_SP2_INC(top0, bot0, out, 4);

            ILVRL_D2_SP(b1, b0, top0, bot0);
            ST_SP2_INC(top0, bot0, out, 4);
        }

        if (m & 2)
        {
            a0 = LD_SP(ln0);
            a1 = LD_SP(ln1);
            ln0 += 4;
            ln1 += 4;

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ST_SP2_INC(top0, bot0, out, 4);
        }

        if (m & 1)
        {
            t0 = ln0[0];  t1 = ln0[1];
            t2 = ln1[0];  t3 = ln1[1];

            out[0] = t0;  out[1] = t1;
            out[2] = t2;  out[3] = t3;
            out += 4;
        }
    }

    /* ---- a single remaining line: straight copy ---- */
    if (n & 1)
    {
        ln0 = next_line;

        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            ST_SP2_INC(a0, b0, out, 4);
        }

        if (m & 2)
        {
            a0 = LD_SP(ln0);
            ln0 += 4;

            ST_SP(a0, out);
            out += 4;
        }

        if (m & 1)
        {
            t0 = ln0[0];
            t1 = ln0[1];

            out[0] = t0;
            out[1] = t1;
            out += 2;
        }
    }

    return 0;
}

View File

@ -0,0 +1,310 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Interleaving copy of src into dst, eight lines wide.
 *
 * src is read as n "lines" that are lda elements apart, each holding m
 * contiguous elements; an element is two FLOATs wide (lda is doubled to turn
 * it into a FLOAT offset). Lines are consumed in groups of 8, then 4, 2 and 1.
 * Within a group the output holds, for every position along the lines, that
 * position's element from each line of the group (the scalar tails below show
 * this layout directly; the vector paths presumably produce the same order
 * through ILVRL_D2_SP -- confirm against macros_msa.h).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG pos_cnt, grp_cnt;
    FLOAT *next_line, *ln0, *ln1, *ln2, *ln3, *ln4, *ln5, *ln6, *ln7;
    FLOAT *out;
    FLOAT t00, t01, t02, t03, t04, t05, t06, t07;
    FLOAT t08, t09, t10, t11, t12, t13, t14, t15;
    v4f32 a0, a1, a2, a3, a4, a5, a6, a7;   /* first vector of line k  */
    v4f32 b0, b1, b2, b3, b4, b5, b6, b7;   /* second vector of line k */
    v4f32 top0, top1, top2, top3, bot0, bot1, bot2, bot3;

    next_line = src;
    out = dst;
    lda *= 2;

    /* ---- groups of eight lines ---- */
    grp_cnt = n >> 3;
    while (grp_cnt--)
    {
        ln0 = next_line;
        ln1 = ln0 + lda;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        ln5 = ln4 + lda;
        ln6 = ln5 + lda;
        ln7 = ln6 + lda;
        next_line += 8 * lda;

        /* four positions (two vectors per line) at a time */
        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            LD_SP2_INC(ln1, 4, a1, b1);
            LD_SP2_INC(ln2, 4, a2, b2);
            LD_SP2_INC(ln3, 4, a3, b3);
            LD_SP2_INC(ln4, 4, a4, b4);
            LD_SP2_INC(ln5, 4, a5, b5);
            LD_SP2_INC(ln6, 4, a6, b6);
            LD_SP2_INC(ln7, 4, a7, b7);

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ILVRL_D2_SP(a3, a2, top1, bot1);
            ILVRL_D2_SP(a5, a4, top2, bot2);
            ILVRL_D2_SP(a7, a6, top3, bot3);
            ST_SP8_INC(top0, top1, top2, top3, bot0, bot1, bot2, bot3, out, 4);

            ILVRL_D2_SP(b1, b0, top0, bot0);
            ILVRL_D2_SP(b3, b2, top1, bot1);
            ILVRL_D2_SP(b5, b4, top2, bot2);
            ILVRL_D2_SP(b7, b6, top3, bot3);
            ST_SP8_INC(top0, top1, top2, top3, bot0, bot1, bot2, bot3, out, 4);
        }

        /* two positions (one vector per line) */
        if (m & 2)
        {
            a0 = LD_SP(ln0);
            a1 = LD_SP(ln1);
            a2 = LD_SP(ln2);
            a3 = LD_SP(ln3);
            a4 = LD_SP(ln4);
            a5 = LD_SP(ln5);
            a6 = LD_SP(ln6);
            a7 = LD_SP(ln7);
            ln0 += 4;
            ln1 += 4;
            ln2 += 4;
            ln3 += 4;
            ln4 += 4;
            ln5 += 4;
            ln6 += 4;
            ln7 += 4;

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ILVRL_D2_SP(a3, a2, top1, bot1);
            ILVRL_D2_SP(a5, a4, top2, bot2);
            ILVRL_D2_SP(a7, a6, top3, bot3);
            ST_SP8_INC(top0, top1, top2, top3, bot0, bot1, bot2, bot3, out, 4);
        }

        /* one leftover position: read everything first, then write */
        if (m & 1)
        {
            t00 = ln0[0];  t01 = ln0[1];
            t02 = ln1[0];  t03 = ln1[1];
            t04 = ln2[0];  t05 = ln2[1];
            t06 = ln3[0];  t07 = ln3[1];
            t08 = ln4[0];  t09 = ln4[1];
            t10 = ln5[0];  t11 = ln5[1];
            t12 = ln6[0];  t13 = ln6[1];
            t14 = ln7[0];  t15 = ln7[1];

            out[0]  = t00;  out[1]  = t01;
            out[2]  = t02;  out[3]  = t03;
            out[4]  = t04;  out[5]  = t05;
            out[6]  = t06;  out[7]  = t07;
            out[8]  = t08;  out[9]  = t09;
            out[10] = t10;  out[11] = t11;
            out[12] = t12;  out[13] = t13;
            out[14] = t14;  out[15] = t15;
            out += 16;
        }
    }

    /* ---- one group of four lines ---- */
    if (n & 4)
    {
        ln0 = next_line;
        ln1 = ln0 + lda;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        next_line += 4 * lda;

        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            LD_SP2_INC(ln1, 4, a1, b1);
            LD_SP2_INC(ln2, 4, a2, b2);
            LD_SP2_INC(ln3, 4, a3, b3);

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ILVRL_D2_SP(a3, a2, top1, bot1);
            ST_SP4_INC(top0, top1, bot0, bot1, out, 4);

            ILVRL_D2_SP(b1, b0, top0, bot0);
            ILVRL_D2_SP(b3, b2, top1, bot1);
            ST_SP4_INC(top0, top1, bot0, bot1, out, 4);
        }

        if (m & 2)
        {
            a0 = LD_SP(ln0);
            a1 = LD_SP(ln1);
            a2 = LD_SP(ln2);
            a3 = LD_SP(ln3);
            ln0 += 4;
            ln1 += 4;
            ln2 += 4;
            ln3 += 4;

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ILVRL_D2_SP(a3, a2, top1, bot1);
            ST_SP4_INC(top0, top1, bot0, bot1, out, 4);
        }

        if (m & 1)
        {
            t00 = ln0[0];  t01 = ln0[1];
            t02 = ln1[0];  t03 = ln1[1];
            t04 = ln2[0];  t05 = ln2[1];
            t06 = ln3[0];  t07 = ln3[1];

            out[0] = t00;  out[1] = t01;
            out[2] = t02;  out[3] = t03;
            out[4] = t04;  out[5] = t05;
            out[6] = t06;  out[7] = t07;
            out += 8;
        }
    }

    /* ---- one group of two lines ---- */
    if (n & 2)
    {
        ln0 = next_line;
        ln1 = ln0 + lda;
        next_line += 2 * lda;

        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            LD_SP2_INC(ln1, 4, a1, b1);

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ST_SP2_INC(top0, bot0, out, 4);

            ILVRL_D2_SP(b1, b0, top0, bot0);
            ST_SP2_INC(top0, bot0, out, 4);
        }

        if (m & 2)
        {
            a0 = LD_SP(ln0);
            a1 = LD_SP(ln1);
            ln0 += 4;
            ln1 += 4;

            ILVRL_D2_SP(a1, a0, top0, bot0);
            ST_SP2_INC(top0, bot0, out, 4);
        }

        if (m & 1)
        {
            t00 = ln0[0];  t01 = ln0[1];
            t02 = ln1[0];  t03 = ln1[1];

            out[0] = t00;  out[1] = t01;
            out[2] = t02;  out[3] = t03;
            out += 4;
        }
    }

    /* ---- a single remaining line: straight copy ---- */
    if (n & 1)
    {
        ln0 = next_line;

        pos_cnt = m >> 2;
        while (pos_cnt--)
        {
            LD_SP2_INC(ln0, 4, a0, b0);
            ST_SP2_INC(a0, b0, out, 4);
        }

        if (m & 2)
        {
            a0 = LD_SP(ln0);
            ln0 += 4;

            ST_SP(a0, out);
            out += 4;
        }

        if (m & 1)
        {
            t00 = ln0[0];
            t01 = ln0[1];

            out[0] = t00;
            out[1] = t01;
            out += 2;
        }
    }

    return 0;
}

View File

@ -0,0 +1,125 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Panel copy of src into dst.
 *
 * src is read as m "lines" that are lda elements apart; an element is two
 * FLOATs wide (lda is doubled to turn it into a FLOAT offset). The n
 * contiguous elements of every line are split into panels that are 4, then 2,
 * then 1 element wide. For each panel, the panel's slice of every line is
 * written to dst, line after line, before the next panel starts.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG pair_cnt, panel_cnt;
    FLOAT *panel;               /* first line of the current panel        */
    FLOAT *even_ln, *odd_ln;    /* the two lines handled per inner step   */
    FLOAT *out;
    FLOAT e_re, e_im, o_re, o_im;
    v4f32 e0, e1, o0, o1;

    panel = src;
    out = dst;
    lda *= 2;

    /* ---- panels of four elements (two vectors per line) ---- */
    panel_cnt = n >> 2;
    while (panel_cnt--)
    {
        even_ln = panel;
        odd_ln = panel + lda;
        panel += 8;

        pair_cnt = m >> 1;
        while (pair_cnt--)
        {
            LD_SP2(even_ln, 4, e0, e1);
            LD_SP2(odd_ln, 4, o0, o1);
            ST_SP4_INC(e0, e1, o0, o1, out, 4);

            even_ln += 2 * lda;
            odd_ln += 2 * lda;
        }

        /* odd line count: one more line on its own */
        if (m & 1)
        {
            LD_SP2(even_ln, 4, e0, e1);
            ST_SP2_INC(e0, e1, out, 4);
        }
    }

    /* ---- one panel of two elements (one vector per line) ---- */
    if (n & 2)
    {
        even_ln = panel;
        odd_ln = panel + lda;
        panel += 4;

        pair_cnt = m >> 1;
        while (pair_cnt--)
        {
            e0 = LD_SP(even_ln);
            o0 = LD_SP(odd_ln);
            ST_SP2_INC(e0, o0, out, 4);

            even_ln += 2 * lda;
            odd_ln += 2 * lda;
        }

        if (m & 1)
        {
            e0 = LD_SP(even_ln);
            ST_SP(e0, out);
            out += 4;
        }
    }

    /* ---- one panel of a single element: scalar copy ---- */
    if (n & 1)
    {
        even_ln = panel;
        odd_ln = panel + lda;
        panel += 2;

        pair_cnt = m >> 1;
        while (pair_cnt--)
        {
            /* read both lines first, then write */
            e_re = even_ln[0];
            e_im = even_ln[1];
            o_re = odd_ln[0];
            o_im = odd_ln[1];

            out[0] = e_re;
            out[1] = e_im;
            out[2] = o_re;
            out[3] = o_im;

            even_ln += 2 * lda;
            odd_ln += 2 * lda;
            out += 4;
        }

        if (m & 1)
        {
            e_re = even_ln[0];
            e_im = even_ln[1];

            out[0] = e_re;
            out[1] = e_im;
            out += 2;
        }
    }

    return 0;
}

View File

@ -0,0 +1,214 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Packing routine: copies single-precision complex data from `src` into the
 * contiguous buffer `dst`.
 *
 * The source is consumed in panels of 8, then 4, 2 and 1 complex value(s)
 * (16, 8, 4 and 2 floats) that are adjacent in memory; `n` decides how many
 * panels of each width exist.  Inside a panel `m` lines are copied, where
 * consecutive lines are `lda` complex values (2 * lda floats) apart.  Lines
 * are moved four at a time, then a pair when (m & 2), then one when (m & 1).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG lines, panels, rep;
    FLOAT *panel, *line_a, *line_b, *dst_pos;
    FLOAT re_a, im_a, re_b, im_b;
    v4f32 q0, q1, q2, q3, q4, q5, q6, q7;
    v4f32 q8, q9, q10, q11, q12, q13, q14, q15;

    panel = src;
    dst_pos = dst;
    lda *= 2;   /* stride in floats instead of complex elements */

    /* ---- panels of 8 complex values (4 vectors per line) ---- */
    panels = n >> 3;
    while (panels--)
    {
        line_a = panel;
        line_b = panel + lda;
        panel += 16;

        lines = m >> 2;
        while (lines--)
        {
            LD_SP4(line_a, 4, q0, q1, q2, q3);
            LD_SP4(line_b, 4, q4, q5, q6, q7);
            LD_SP4(line_a + 2 * lda, 4, q8, q9, q10, q11);
            LD_SP4(line_b + 2 * lda, 4, q12, q13, q14, q15);

            ST_SP8_INC(q0, q1, q2, q3, q4, q5, q6, q7, dst_pos, 4);
            ST_SP8_INC(q8, q9, q10, q11, q12, q13, q14, q15, dst_pos, 4);

            line_a += 4 * lda;
            line_b += 4 * lda;
        }

        if (m & 2)
        {
            LD_SP4(line_a, 4, q0, q1, q2, q3);
            LD_SP4(line_b, 4, q4, q5, q6, q7);

            ST_SP8_INC(q0, q1, q2, q3, q4, q5, q6, q7, dst_pos, 4);

            line_a += 2 * lda;
            line_b += 2 * lda;
        }

        if (m & 1)
        {
            LD_SP4(line_a, 4, q0, q1, q2, q3);
            ST_SP4_INC(q0, q1, q2, q3, dst_pos, 4);
        }
    }

    /* ---- one panel of 4 complex values (2 vectors per line) ---- */
    if (n & 4)
    {
        line_a = panel;
        line_b = panel + lda;
        panel += 8;

        lines = m >> 2;
        while (lines--)
        {
            LD_SP2(line_a, 4, q0, q1);
            LD_SP2(line_b, 4, q2, q3);
            LD_SP2(line_a + 2 * lda, 4, q4, q5);
            LD_SP2(line_b + 2 * lda, 4, q6, q7);

            ST_SP4_INC(q0, q1, q2, q3, dst_pos, 4);
            ST_SP4_INC(q4, q5, q6, q7, dst_pos, 4);

            line_a += 4 * lda;
            line_b += 4 * lda;
        }

        if (m & 2)
        {
            LD_SP2(line_a, 4, q0, q1);
            LD_SP2(line_b, 4, q2, q3);

            ST_SP4_INC(q0, q1, q2, q3, dst_pos, 4);

            line_a += 2 * lda;
            line_b += 2 * lda;
        }

        if (m & 1)
        {
            LD_SP2(line_a, 4, q0, q1);
            ST_SP2_INC(q0, q1, dst_pos, 4);
        }
    }

    /* ---- one panel of 2 complex values (1 vector per line) ---- */
    if (n & 2)
    {
        line_a = panel;
        line_b = panel + lda;
        panel += 4;

        lines = m >> 2;
        while (lines--)
        {
            q0 = LD_SP(line_a);
            q1 = LD_SP(line_b);
            q2 = LD_SP(line_a + 2 * lda);
            q3 = LD_SP(line_b + 2 * lda);

            ST_SP4_INC(q0, q1, q2, q3, dst_pos, 4);

            line_a += 4 * lda;
            line_b += 4 * lda;
        }

        if (m & 2)
        {
            q0 = LD_SP(line_a);
            q1 = LD_SP(line_b);

            ST_SP2_INC(q0, q1, dst_pos, 4);

            line_a += 2 * lda;
            line_b += 2 * lda;
        }

        if (m & 1)
        {
            q0 = LD_SP(line_a);
            ST_SP(q0, dst_pos);
            dst_pos += 4;
        }
    }

    /* ---- last single complex value per line: plain scalar copies ---- */
    if (n & 1)
    {
        line_a = panel;
        line_b = panel + lda;
        panel += 2;

        lines = m >> 2;
        while (lines--)
        {
            /* four lines per pass, done as two identical two-line copies */
            for (rep = 0; rep < 2; rep++)
            {
                re_a = line_a[0];
                im_a = line_a[1];
                re_b = line_b[0];
                im_b = line_b[1];

                dst_pos[0] = re_a;
                dst_pos[1] = im_a;
                dst_pos[2] = re_b;
                dst_pos[3] = im_b;

                line_a += 2 * lda;
                line_b += 2 * lda;
                dst_pos += 4;
            }
        }

        if (m & 2)
        {
            re_a = line_a[0];
            im_a = line_a[1];
            re_b = line_b[0];
            im_b = line_b[1];

            dst_pos[0] = re_a;
            dst_pos[1] = im_a;
            dst_pos[2] = re_b;
            dst_pos[3] = im_b;

            line_a += 2 * lda;
            line_b += 2 * lda;
            dst_pos += 4;
        }

        if (m & 1)
        {
            re_a = line_a[0];
            im_a = line_a[1];

            dst_pos[0] = re_a;
            dst_pos[1] = im_a;
            dst_pos += 2;
        }
    }

    return 0;
}

611
kernel/mips/cgemv_n_msa.c Normal file
View File

@ -0,0 +1,611 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Sign table for the complex multiply-accumulates in this file.
 *
 *   temp = alpha (*) x :  temp_r = alpha_r * x_r  OP3  alpha_i * x_i
 *                         temp_i = alpha_r * x_i  OP4  alpha_i * x_r
 *   y   += temp (*) a  :  y_r   += temp_r * a_r ;  y_r  OP0  temp_i * a_i
 *                         y_i   OP1 temp_r * a_i ; y_i  OP2  temp_i * a_r
 *
 * OP1 is the sign attached to a_i (-= when CONJ is defined) and OP2 the sign
 * attached to temp_i (-= when XCONJ is defined); OP0 must then be the
 * negated product of those two signs for the result to be a genuine complex
 * product.  The original table had OP2 swapped between the two CONJ cases
 * (-= without XCONJ, += with XCONJ), which yields e.g. for CONJ && !XCONJ a
 * real part of temp*conj(a) paired with an imaginary part of -(temp*a).
 */
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

#if !defined(XCONJ)
#define OP3 -=
#define OP4 +=
#else
#define OP3 +=
#define OP4 -=
#endif

#if !defined(CONJ)
#if !defined(XCONJ)
#define OP0 -=
#define OP1 +=
#define OP2 +=
#else
#define OP0 +=
#define OP1 +=
#define OP2 -=
#endif
#else
#if !defined(XCONJ)
/* temp * conj(a): (tr*ar + ti*ai) + i*(ti*ar - tr*ai) */
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
/* conj(temp) * conj(a): (tr*ar - ti*ai) - i*(tr*ai + ti*ar) */
#define OP0 -=
#define OP1 -=
#define OP2 -=
#endif
#endif
/*
 * 8-row x 4-column step: loads 16 floats at offset k from each of pa0..pa3,
 * splits them into real/imaginary vectors with PCKEVOD_W2_SP and accumulates
 * tp{0..3}{r,i} times each column into y0r/y0i (first 4 rows) and y1r/y1i
 * (next 4 rows).  OP0/OP1/OP2 supply the conjugation-dependent signs.
 * Relies on the caller's locals (pa*, k, t*, src*, tp*, y*) by name.
 * The last line deliberately has no trailing backslash so that the line
 * following the macro is never spliced into its body.
 */
#define CGEMV_N_8x4() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
    \
    y0r += tp0r * src0r; \
    y1r += tp0r * src1r; \
    y0r += tp1r * src2r; \
    y1r += tp1r * src3r; \
    y0r += tp2r * src4r; \
    y1r += tp2r * src5r; \
    y0r += tp3r * src6r; \
    y1r += tp3r * src7r; \
    \
    y0r OP0 tp0i * src0i; \
    y1r OP0 tp0i * src1i; \
    y0r OP0 tp1i * src2i; \
    y1r OP0 tp1i * src3i; \
    y0r OP0 tp2i * src4i; \
    y1r OP0 tp2i * src5i; \
    y0r OP0 tp3i * src6i; \
    y1r OP0 tp3i * src7i; \
    \
    y0i OP1 tp0r * src0i; \
    y1i OP1 tp0r * src1i; \
    y0i OP1 tp1r * src2i; \
    y1i OP1 tp1r * src3i; \
    y0i OP1 tp2r * src4i; \
    y1i OP1 tp2r * src5i; \
    y0i OP1 tp3r * src6i; \
    y1i OP1 tp3r * src7i; \
    \
    y0i OP2 tp0i * src0r; \
    y1i OP2 tp0i * src1r; \
    y0i OP2 tp1i * src2r; \
    y1i OP2 tp1i * src3r; \
    y0i OP2 tp2i * src4r; \
    y1i OP2 tp2i * src5r; \
    y0i OP2 tp3i * src6r; \
    y1i OP2 tp3i * src7r;
/*
 * 4-row x 4-column step: loads 8 floats at offset k from each of pa0..pa3,
 * splits them into real/imaginary vectors and accumulates the four scaled
 * columns into y0r/y0i.  Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_N_4x4() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    LD_SP2(pa2 + k, 4, t8, t9); \
    LD_SP2(pa3 + k, 4, t12, t13); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    \
    y0r += tp0r * src0r; \
    y0r += tp1r * src2r; \
    y0r += tp2r * src4r; \
    y0r += tp3r * src6r; \
    \
    y0r OP0 tp0i * src0i; \
    y0r OP0 tp1i * src2i; \
    y0r OP0 tp2i * src4i; \
    y0r OP0 tp3i * src6i; \
    \
    y0i OP1 tp0r * src0i; \
    y0i OP1 tp1r * src2i; \
    y0i OP1 tp2r * src4i; \
    y0i OP1 tp3r * src6i; \
    \
    y0i OP2 tp0i * src0r; \
    y0i OP2 tp1i * src2r; \
    y0i OP2 tp2i * src4r; \
    y0i OP2 tp3i * src6r;
/*
 * Scalar 1-row x 4-column step: reads one complex y element, adds the
 * contributions of pa0..pa3 at offset k scaled by temp{0..3}_{r,i}, and
 * writes it back.  Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_N_1x4() \
    res0 = y[0 * inc_y2]; \
    res1 = y[0 * inc_y2 + 1]; \
    \
    res0 += temp0_r * pa0[k]; \
    res0 OP0 temp0_i * pa0[k + 1]; \
    res0 += temp1_r * pa1[k]; \
    res0 OP0 temp1_i * pa1[k + 1]; \
    res0 += temp2_r * pa2[k]; \
    res0 OP0 temp2_i * pa2[k + 1]; \
    res0 += temp3_r * pa3[k]; \
    res0 OP0 temp3_i * pa3[k + 1]; \
    \
    res1 OP1 temp0_r * pa0[k + 1]; \
    res1 OP2 temp0_i * pa0[k]; \
    res1 OP1 temp1_r * pa1[k + 1]; \
    res1 OP2 temp1_i * pa1[k]; \
    res1 OP1 temp2_r * pa2[k + 1]; \
    res1 OP2 temp2_i * pa2[k]; \
    res1 OP1 temp3_r * pa3[k + 1]; \
    res1 OP2 temp3_i * pa3[k]; \
    \
    y[0 * inc_y2] = res0; \
    y[0 * inc_y2 + 1] = res1;
/*
 * 8-row x 2-column step: loads 16 floats at offset k from pa0 and pa1,
 * splits them into real/imaginary vectors and accumulates the two scaled
 * columns into y0r/y0i and y1r/y1i.  Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_N_8x2() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    \
    y0r += tp0r * src0r; \
    y1r += tp0r * src1r; \
    y0r += tp1r * src2r; \
    y1r += tp1r * src3r; \
    \
    y0r OP0 tp0i * src0i; \
    y1r OP0 tp0i * src1i; \
    y0r OP0 tp1i * src2i; \
    y1r OP0 tp1i * src3i; \
    \
    y0i OP1 tp0r * src0i; \
    y1i OP1 tp0r * src1i; \
    y0i OP1 tp1r * src2i; \
    y1i OP1 tp1r * src3i; \
    \
    y0i OP2 tp0i * src0r; \
    y1i OP2 tp0i * src1r; \
    y0i OP2 tp1i * src2r; \
    y1i OP2 tp1i * src3r;
/*
 * 4-row x 2-column step: loads 8 floats at offset k from pa0 and pa1,
 * splits them into real/imaginary vectors and accumulates the two scaled
 * columns into y0r/y0i.  Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_N_4x2() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    \
    y0r += tp0r * src0r; \
    y0r += tp1r * src2r; \
    \
    y0r OP0 tp0i * src0i; \
    y0r OP0 tp1i * src2i; \
    \
    y0i OP1 tp0r * src0i; \
    y0i OP1 tp1r * src2i; \
    \
    y0i OP2 tp0i * src0r; \
    y0i OP2 tp1i * src2r;
/*
 * Scalar 1-row x 2-column step: updates one complex y element with the
 * contributions of pa0 and pa1 at offset k, scaled by temp0_* and temp1_*.
 * Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_N_1x2() \
    res0 = y[0 * inc_y2]; \
    res1 = y[0 * inc_y2 + 1]; \
    \
    res0 += temp0_r * pa0[k]; \
    res0 OP0 temp0_i * pa0[k + 1]; \
    res0 += temp1_r * pa1[k]; \
    res0 OP0 temp1_i * pa1[k + 1]; \
    \
    res1 OP1 temp0_r * pa0[k + 1]; \
    res1 OP2 temp0_i * pa0[k]; \
    res1 OP1 temp1_r * pa1[k + 1]; \
    res1 OP2 temp1_i * pa1[k]; \
    \
    y[0 * inc_y2] = res0; \
    y[0 * inc_y2 + 1] = res1;
/*
 * Scalar 1-row x 1-column step: updates one complex y element with
 * (temp_r, temp_i) times the complex value at pa0[k], pa0[k + 1].
 * Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_N_1x1() \
    res0 = y[0 * inc_y2]; \
    res1 = y[0 * inc_y2 + 1]; \
    \
    res0 += temp_r * pa0[k]; \
    res0 OP0 temp_i * pa0[k + 1]; \
    \
    res1 OP1 temp_r * pa0[k + 1]; \
    res1 OP2 temp_i * pa0[k]; \
    \
    y[0 * inc_y2] = res0; \
    y[0 * inc_y2 + 1] = res1;
/*
 * Contiguous-x variant: loads 8 floats (4 complex values) from x, splits
 * them into x0r/x0i, multiplies by alpha (signs from OP3/OP4) into
 * tp4r/tp4i, then SPLATI_W4_SP fills tp0..tp3 {r,i} from tp4r/tp4i
 * (presumably one lane broadcast per output vector - see macros_msa.h).
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_X4_SCALE_VECTOR() \
    LD_SP2(x, 4, x0, x1); \
    \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
    \
    tp4r = alphar * x0r; \
    tp4r OP3 alphai * x0i; \
    tp4i = alphar * x0i; \
    tp4i OP4 alphai * x0r; \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
/*
 * Strided-x variant: gathers 4 complex values spaced inc_x2 floats apart
 * into x0r/x0i one lane at a time with __msa_insert_w, then scales by alpha
 * exactly like CLOAD_X4_SCALE_VECTOR.  tp0r is only used as the carrier
 * vector for the first insert; all four lanes are overwritten.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_X4_SCALE_GP() \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
    \
    tp4r = alphar * x0r; \
    tp4r OP3 alphai * x0i; \
    tp4i = alphar * x0i; \
    tp4i OP4 alphai * x0r; \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
/*
 * Scalar load of two complex x values (stride inc_x2), scaled by
 * (alpha_r, alpha_i) with signs from OP3/OP4 into temp0_* / temp1_*, which
 * are then copied into the vectors tp0r/tp0i/tp1r/tp1i via
 * COPY_FLOAT_TO_VECTOR.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_X2_SCALE_GP() \
    temp0_r = alpha_r * x[0 * inc_x2]; \
    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
    temp0_i = alpha_r * x[0 * inc_x2 + 1]; \
    temp0_i OP4 alpha_i * x[0 * inc_x2]; \
    \
    temp1_r = alpha_r * x[1 * inc_x2]; \
    temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \
    temp1_i = alpha_r * x[1 * inc_x2 + 1]; \
    temp1_i OP4 alpha_i * x[1 * inc_x2]; \
    \
    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \
    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \
    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \
    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);
/*
 * Scalar load of one complex x value, scaled by (alpha_r, alpha_i) with
 * signs from OP3/OP4 into temp_r / temp_i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_X1_SCALE_GP() \
    temp_r = alpha_r * x[0 * inc_x2]; \
    temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
    temp_i = alpha_r * x[0 * inc_x2 + 1]; \
    temp_i OP4 alpha_i * x[0 * inc_x2];
/*
 * Contiguous-y load of 16 floats (8 complex values) into y0..y3, split into
 * y0r/y0i and y1r/y1i.  No trailing backslash on the last line, so the
 * following line is not absorbed into the macro.
 */
#define CLOAD_Y8_VECTOR() \
    LD_SP4(y, 4, y0, y1, y2, y3); \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i); \
    PCKEVOD_W2_SP(y3, y2, y1r, y1i);
/*
 * Contiguous-y load of 8 floats (4 complex values) into y0/y1, split into
 * y0r/y0i.  No trailing backslash on the last line, so the following line
 * is not absorbed into the macro.
 */
#define CLOAD_Y4_VECTOR() \
    LD_SP2(y, 4, y0, y1); \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);
/*
 * Contiguous-y store: re-interleaves y0r/y0i and y1r/y1i with ILVRL_W2_SP
 * and writes the resulting four vectors (16 floats) back to y.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSTORE_Y8_VECTOR() \
    ILVRL_W2_SP(y0i, y0r, y0, y1); \
    ILVRL_W2_SP(y1i, y1r, y2, y3); \
    ST_SP4(y0, y1, y2, y3, y, 4);
/*
 * Contiguous-y store: re-interleaves y0r/y0i with ILVRL_W2_SP and writes the
 * resulting two vectors (8 floats) back to y.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSTORE_Y4_VECTOR() \
    ILVRL_W2_SP(y0i, y0r, y0, y1); \
    ST_SP2(y0, y1, y, 4);
/*
 * Strided-y load: gathers 8 complex values spaced inc_y2 floats apart into
 * y0r/y1r (real parts) and y0i/y1i (imaginary parts), one lane at a time.
 * tp0r only serves as the carrier vector for each first insert; all four
 * lanes of every destination are overwritten.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_Y8_GP() \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1)));
/*
 * Strided-y load: gathers 4 complex values spaced inc_y2 floats apart into
 * y0r (real parts) and y0i (imaginary parts), one lane at a time.  tp0r is
 * only the carrier vector for the first insert of each destination.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_Y4_GP() \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1)));
/*
 * Strided-y store: scatters the 8 real lanes of y0r/y1r and the 8 imaginary
 * lanes of y0i/y1i back to y (stride inc_y2 floats) with __msa_copy_s_w.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSTORE_Y8_GP() \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \
    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \
    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \
    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \
    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \
    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \
    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \
    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);
/*
 * Strided-y store: scatters the 4 real lanes of y0r and the 4 imaginary
 * lanes of y0i back to y (stride inc_y2 floats) with __msa_copy_s_w.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSTORE_Y4_GP() \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);
/*
 * Driver for the non-transposed complex GEMV update.  Columns are consumed
 * four at a time, then two (n & 2), then one (n & 1).  For every column
 * group the scaled x values are prepared once (CLOAD_X*_SCALE), y is
 * rewound to y_org, and the rows are processed 8 at a time, then 4 (m & 4),
 * then one by one (m & 3; the single-column case walks all m rows singly).
 *
 * CLOAD_X4_SCALE / CLOAD_X2_SCALE / CLOAD_X1_SCALE / CLOAD_Y8 / CLOAD_Y4 /
 * CSTORE_Y8 / CSTORE_Y4 must be #defined to the contiguous or strided
 * variant before expansion.  All working variables are the caller's locals.
 *
 * The last line has no trailing backslash so the line after the macro (the
 * function header) is not spliced into it.
 */
#define CGEMV_N_MSA() \
    for (j = (n >> 2); j--;) \
    { \
        CLOAD_X4_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_Y8(); \
            CGEMV_N_8x4(); \
            CSTORE_Y8(); \
            \
            k += 2 * 8; \
            y += inc_y2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_Y4(); \
            CGEMV_N_4x4(); \
            CSTORE_Y4(); \
            \
            k += 2 * 4; \
            y += inc_y2 * 4; \
        } \
        \
        if (m & 3) \
        { \
            /* scalar copies of the four scaled x values for the row tail */ \
            temp0_r = tp4r[0]; \
            temp1_r = tp4r[1]; \
            temp2_r = tp4r[2]; \
            temp3_r = tp4r[3]; \
            \
            temp0_i = tp4i[0]; \
            temp1_i = tp4i[1]; \
            temp2_i = tp4i[2]; \
            temp3_i = tp4i[3]; \
            \
            for (i = (m & 3); i--;) \
            { \
                CGEMV_N_1x4(); \
                \
                k += 2; \
                y += inc_y2; \
            } \
        } \
        \
        pa0 += 4 * lda2; \
        pa1 += 4 * lda2; \
        pa2 += 4 * lda2; \
        pa3 += 4 * lda2; \
        \
        x += 4 * inc_x2; \
    } \
    \
    if (n & 2) \
    { \
        CLOAD_X2_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_Y8(); \
            CGEMV_N_8x2(); \
            CSTORE_Y8(); \
            \
            k += 2 * 8; \
            y += inc_y2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_Y4(); \
            CGEMV_N_4x2(); \
            CSTORE_Y4(); \
            \
            k += 2 * 4; \
            y += inc_y2 * 4; \
        } \
        \
        for (i = (m & 3); i--;) \
        { \
            CGEMV_N_1x2(); \
            \
            k += 2; \
            y += inc_y2; \
        } \
        \
        pa0 += 2 * lda2; \
        pa1 += 2 * lda2; \
        \
        x += 2 * inc_x2; \
    } \
    \
    if (n & 1) \
    { \
        CLOAD_X1_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = m; i--;) \
        { \
            CGEMV_N_1x1(); \
            \
            k += 2; \
            y += inc_y2; \
        } \
        \
        pa0 += lda2; \
        x += inc_x2; \
    }
/*
 * Complex single-precision GEMV kernel, non-transposed form: accumulates
 * into y the matrix A (m rows, n columns, column stride lda2 complex
 * elements) times x scaled by (alpha_r, alpha_i).  The conjugation variant
 * is selected at compile time through CONJ / XCONJ (see the OP* macros).
 *
 * lda2, inc_x2 and inc_y2 arrive as strides in complex elements and are
 * doubled on entry so that they count floats.  dummy1 and buffer are unused.
 * One of four expansions of CGEMV_N_MSA is chosen depending on whether x
 * and/or y are contiguous (float stride 2).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
          BLASLONG inc_y2, FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT temp_r, temp_i, res0, res1, temp0_r;
    FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
    v4f32 alphar, alphai;
    v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    /*
     * tp0r is zero-initialised because the strided gather macros
     * (CLOAD_X4_SCALE_GP, CLOAD_Y8_GP, CLOAD_Y4_GP) pass it to
     * __msa_insert_w as the carrier vector; on the strided-x paths that read
     * happens before tp0r has ever been assigned.
     */
    v4f32 tp0r = {0};
    v4f32 tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;

    /* complex-element strides -> float strides */
    lda2 = 2 * lda2;
    inc_x2 = 2 * inc_x2;
    inc_y2 = 2 * inc_y2;

    /* four consecutive columns of A */
    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
    alphai = COPY_FLOAT_TO_VECTOR(alpha_i);

    if ((2 == inc_x2) && (2 == inc_y2))
    {
        /* x and y both contiguous */
        #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR
        #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
        #define CLOAD_Y8 CLOAD_Y8_VECTOR
        #define CLOAD_Y4 CLOAD_Y4_VECTOR
        #define CSTORE_Y8 CSTORE_Y8_VECTOR
        #define CSTORE_Y4 CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (2 == inc_x2)
    {
        /* x contiguous, y strided */
        #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR
        #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
        #define CLOAD_Y8 CLOAD_Y8_GP
        #define CLOAD_Y4 CLOAD_Y4_GP
        #define CSTORE_Y8 CSTORE_Y8_GP
        #define CSTORE_Y4 CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (2 == inc_y2)
    {
        /* x strided, y contiguous */
        #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP
        #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
        #define CLOAD_Y8 CLOAD_Y8_VECTOR
        #define CLOAD_Y4 CLOAD_Y4_VECTOR
        #define CSTORE_Y8 CSTORE_Y8_VECTOR
        #define CSTORE_Y4 CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else
    {
        /* x and y both strided */
        #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP
        #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
        #define CLOAD_Y8 CLOAD_Y8_GP
        #define CLOAD_Y4 CLOAD_Y4_GP
        #define CSTORE_Y8 CSTORE_Y8_GP
        #define CSTORE_Y4 CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }

    return(0);
}
/* Drop the file-local sign operators so they cannot leak past this kernel. */
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

583
kernel/mips/cgemv_t_msa.c Normal file
View File

@ -0,0 +1,583 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Sign operators for the complex multiply-accumulates in this file:
 *   real part:  acc_r += p_r * q_r ;  acc_r OP0 p_i * q_i
 *   imag part:  acc_i OP1 p_r * q_i ; acc_i OP2 p_i * q_r
 * When CONJ and XCONJ are either both defined or both undefined the plain
 * product signs (-, +, +) are used; otherwise (+, +, -).
 */
#undef OP0
#undef OP1
#undef OP2

#if defined(CONJ) == defined(XCONJ)
#define OP0 -=
#define OP1 +=
#define OP2 +=
#else
#define OP0 +=
#define OP1 +=
#define OP2 -=
#endif
/*
 * 8-row x 4-column dot-product step: loads 16 floats at offset k from each
 * of pa0..pa3, splits them into real/imaginary vectors and accumulates the
 * lane-wise products with x0r/x0i (rows 0-3) and x1r/x1i (rows 4-7) into
 * the per-column partial sums tp{0..3}r / tp{0..3}i.  Signs come from
 * OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_8x4() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
    \
    tp0r += src0r * x0r; \
    tp0r += src1r * x1r; \
    tp0r OP0 src0i * x0i; \
    tp0r OP0 src1i * x1i; \
    \
    tp1r += src2r * x0r; \
    tp1r += src3r * x1r; \
    tp1r OP0 src2i * x0i; \
    tp1r OP0 src3i * x1i; \
    \
    tp2r += src4r * x0r; \
    tp2r += src5r * x1r; \
    tp2r OP0 src4i * x0i; \
    tp2r OP0 src5i * x1i; \
    \
    tp3r += src6r * x0r; \
    tp3r += src7r * x1r; \
    tp3r OP0 src6i * x0i; \
    tp3r OP0 src7i * x1i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP1 src1r * x1i; \
    tp0i OP2 src0i * x0r; \
    tp0i OP2 src1i * x1r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP1 src3r * x1i; \
    tp1i OP2 src2i * x0r; \
    tp1i OP2 src3i * x1r; \
    \
    tp2i OP1 src4r * x0i; \
    tp2i OP1 src5r * x1i; \
    tp2i OP2 src4i * x0r; \
    tp2i OP2 src5i * x1r; \
    \
    tp3i OP1 src6r * x0i; \
    tp3i OP1 src7r * x1i; \
    tp3i OP2 src6i * x0r; \
    tp3i OP2 src7i * x1r;
/*
 * 8-row x 2-column dot-product step: like CGEMV_T_8x4 but only for the
 * columns at pa0 and pa1, accumulating into tp0r/tp0i and tp1r/tp1i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_8x2() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
    \
    tp0r += src0r * x0r; \
    tp0r += src1r * x1r; \
    tp0r OP0 src0i * x0i; \
    tp0r OP0 src1i * x1i; \
    \
    tp1r += src2r * x0r; \
    tp1r += src3r * x1r; \
    tp1r OP0 src2i * x0i; \
    tp1r OP0 src3i * x1i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP1 src1r * x1i; \
    tp0i OP2 src0i * x0r; \
    tp0i OP2 src1i * x1r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP1 src3r * x1i; \
    tp1i OP2 src2i * x0r; \
    tp1i OP2 src3i * x1r;
/*
 * 8-row x 1-column dot-product step: loads 16 floats at pa0 + k, splits
 * them into real/imaginary vectors and accumulates the products with
 * x0r/x0i and x1r/x1i into tp0r/tp0i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_8x1() \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
    \
    tp0r += src0r * x0r; \
    tp0r += src1r * x1r; \
    tp0r OP0 src0i * x0i; \
    tp0r OP0 src1i * x1i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP1 src1r * x1i; \
    tp0i OP2 src0i * x0r; \
    tp0i OP2 src1i * x1r;
/*
 * 4-row x 4-column dot-product step: loads 8 floats at offset k from each
 * of pa0..pa3, splits them into real/imaginary vectors and accumulates the
 * products with x0r/x0i into tp{0..3}r / tp{0..3}i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_4x4() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    LD_SP2(pa2 + k, 4, t8, t9); \
    LD_SP2(pa3 + k, 4, t12, t13); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
    \
    tp0r += src0r * x0r; \
    tp0r OP0 src0i * x0i; \
    \
    tp1r += src2r * x0r; \
    tp1r OP0 src2i * x0i; \
    \
    tp2r += src4r * x0r; \
    tp2r OP0 src4i * x0i; \
    \
    tp3r += src6r * x0r; \
    tp3r OP0 src6i * x0i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP2 src0i * x0r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP2 src2i * x0r; \
    \
    tp2i OP1 src4r * x0i; \
    tp2i OP2 src4i * x0r; \
    \
    tp3i OP1 src6r * x0i; \
    tp3i OP2 src6i * x0r;
/*
 * 4-row x 2-column dot-product step: like CGEMV_T_4x4 but only for the
 * columns at pa0 and pa1, accumulating into tp0r/tp0i and tp1r/tp1i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_4x2() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    LD_SP2(pa1 + k, 4, t4, t5); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
    \
    tp0r += src0r * x0r; \
    tp0r OP0 src0i * x0i; \
    \
    tp1r += src2r * x0r; \
    tp1r OP0 src2i * x0i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP2 src0i * x0r; \
    \
    tp1i OP1 src2r * x0i; \
    tp1i OP2 src2i * x0r;
/*
 * 4-row x 1-column dot-product step: loads 8 floats at pa0 + k, splits them
 * into real/imaginary vectors and accumulates the products with x0r/x0i
 * into tp0r/tp0i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_4x1() \
    LD_SP2(pa0 + k, 4, t0, t1); \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
    \
    tp0r += src0r * x0r; \
    tp0r OP0 src0i * x0i; \
    \
    tp0i OP1 src0r * x0i; \
    tp0i OP2 src0i * x0r;
/*
 * Scalar 1-row x 4-column step: multiplies the complex values at offset k
 * of pa0..pa3 by the current complex x element and adds the products to
 * temp{0..3}r / temp{0..3}i.  Signs come from OP0/OP1/OP2.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_1x4() \
    temp0r += pa0[k + 0] * x[0 * inc_x2]; \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
    temp1r += pa1[k + 0] * x[0 * inc_x2]; \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \
    temp2r += pa2[k + 0] * x[0 * inc_x2]; \
    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \
    temp3r += pa3[k + 0] * x[0 * inc_x2]; \
    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \
    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \
    temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \
    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \
    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];
/*
 * Scalar 1-row x 2-column step: multiplies the complex values at offset k
 * of pa0 and pa1 by the current complex x element and adds the products to
 * temp0r/temp0i and temp1r/temp1i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_1x2() \
    temp0r += pa0[k + 0] * x[0 * inc_x2]; \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
    temp1r += pa1[k + 0] * x[0 * inc_x2]; \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];
/*
 * Scalar 1-row x 1-column step: multiplies the complex value at pa0[k] by
 * the current complex x element and adds the product to temp0r/temp0i.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CGEMV_T_1x1() \
    temp0r += pa0[k + 0] * x[0 * inc_x2]; \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];
/*
 * Reads four complex y elements (stride inc_y2 floats), adds alpha times
 * the accumulated dot products temp{0..3}r / temp{0..3}i, and writes them
 * back.  alphar / alphai are the scalar function arguments here.
 *
 * NOTE(review): the alpha multiplication reuses OP0/OP2, whose signs were
 * selected for the A*x product.  In builds where OP0 is += and OP2 is -=
 * this evaluates (alphar - i*alphai) * temp; confirm that applying alpha
 * conjugated in those builds is intended.
 *
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSCALE_STORE_Y4_GP() \
    res0r = y[0 * inc_y2]; \
    res1r = y[1 * inc_y2]; \
    res2r = y[2 * inc_y2]; \
    res3r = y[3 * inc_y2]; \
    \
    res0i = y[0 * inc_y2 + 1]; \
    res1i = y[1 * inc_y2 + 1]; \
    res2i = y[2 * inc_y2 + 1]; \
    res3i = y[3 * inc_y2 + 1]; \
    \
    res0r += alphar * temp0r; \
    res0r OP0 alphai * temp0i; \
    res1r += alphar * temp1r; \
    res1r OP0 alphai * temp1i; \
    res2r += alphar * temp2r; \
    res2r OP0 alphai * temp2i; \
    res3r += alphar * temp3r; \
    res3r OP0 alphai * temp3i; \
    \
    res0i OP1 alphar * temp0i; \
    res0i OP2 alphai * temp0r; \
    res1i OP1 alphar * temp1i; \
    res1i OP2 alphai * temp1r; \
    res2i OP1 alphar * temp2i; \
    res2i OP2 alphai * temp2r; \
    res3i OP1 alphar * temp3i; \
    res3i OP2 alphai * temp3r; \
    \
    y[0 * inc_y2] = res0r; \
    y[1 * inc_y2] = res1r; \
    y[2 * inc_y2] = res2r; \
    y[3 * inc_y2] = res3r; \
    \
    y[0 * inc_y2 + 1] = res0i; \
    y[1 * inc_y2 + 1] = res1i; \
    y[2 * inc_y2 + 1] = res2i; \
    y[3 * inc_y2 + 1] = res3i;
/*
 * Reads two complex y elements (stride inc_y2 floats), adds alpha times the
 * accumulated dot products temp0* / temp1*, and writes them back.
 *
 * NOTE(review): the alpha multiplication reuses OP0/OP2 from the A*x
 * product; where OP0 is += and OP2 is -= this evaluates
 * (alphar - i*alphai) * temp - confirm that is intended.
 *
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSCALE_STORE_Y2_GP() \
    res0r = y[0 * inc_y2]; \
    res1r = y[1 * inc_y2]; \
    \
    res0i = y[0 * inc_y2 + 1]; \
    res1i = y[1 * inc_y2 + 1]; \
    \
    res0r += alphar * temp0r; \
    res0r OP0 alphai * temp0i; \
    res1r += alphar * temp1r; \
    res1r OP0 alphai * temp1i; \
    \
    res0i OP1 alphar * temp0i; \
    res0i OP2 alphai * temp0r; \
    res1i OP1 alphar * temp1i; \
    res1i OP2 alphai * temp1r; \
    \
    y[0 * inc_y2] = res0r; \
    y[1 * inc_y2] = res1r; \
    \
    y[0 * inc_y2 + 1] = res0i; \
    y[1 * inc_y2 + 1] = res1i;
/*
 * Reads one complex y element, adds alpha times the accumulated dot product
 * temp0r/temp0i, and writes it back.
 *
 * NOTE(review): the alpha multiplication reuses OP0/OP2 from the A*x
 * product; where OP0 is += and OP2 is -= this evaluates
 * (alphar - i*alphai) * temp - confirm that is intended.
 *
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CSCALE_STORE_Y1_GP() \
    res0r = y[0 * inc_y2]; \
    res0i = y[0 * inc_y2 + 1]; \
    \
    res0r += alphar * temp0r; \
    res0r OP0 alphai * temp0i; \
    \
    res0i OP1 alphar * temp0i; \
    res0i OP2 alphai * temp0r; \
    \
    y[0 * inc_y2] = res0r; \
    y[0 * inc_y2 + 1] = res0i;
/*
 * Contiguous-x load of 16 floats (8 complex values) into x0..x3, split into
 * x0r/x0i and x1r/x1i.  No trailing backslash on the last line, so the
 * following line is not absorbed into the macro.
 */
#define CLOAD_X8_VECTOR() \
    LD_SP4(x, 4, x0, x1, x2, x3); \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
    PCKEVOD_W2_SP(x3, x2, x1r, x1i);
/*
 * Contiguous-x load of 8 floats (4 complex values) into x0/x1, split into
 * x0r/x0i.  No trailing backslash on the last line, so the following line
 * is not absorbed into the macro.
 */
#define CLOAD_X4_VECTOR() \
    LD_SP2(x, 4, x0, x1); \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);
/*
 * Strided-x load: gathers 8 complex values spaced inc_x2 floats apart into
 * x0r/x1r (real parts) and x0i/x1i (imaginary parts), one lane at a time.
 * tp0r only serves as the carrier vector for each first insert; all four
 * lanes of every destination are overwritten.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_X8_GP() \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
    x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1)));
/*
 * Strided-x load: gathers 4 complex values spaced inc_x2 floats apart into
 * x0r (real parts) and x0i (imaginary parts), one lane at a time.  tp0r is
 * only the carrier vector for the first insert of each destination.
 * No trailing backslash on the last line, so the following line is not
 * absorbed into the macro.
 */
#define CLOAD_X4_GP() \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1)));
/*
 * Driver for the transposed complex GEMV update.  Columns of A are consumed
 * four at a time, then two (n & 2), then one (n & 1); each column yields
 * one complex y element.  For every column group the vector accumulators
 * are cleared, x is rewound to srcx_org, rows are processed 8 at a time and
 * then 4 (m & 4), the vector partial sums are reduced to scalars, the
 * remaining (m & 3) rows are added with scalar code, and the result is
 * scaled by alpha and added to y by CSCALE_STORE_Y*_GP.
 *
 * CLOAD_X8 / CLOAD_X4 must be #defined to the contiguous or strided variant
 * before expansion.  All working variables are the caller's locals.
 *
 * The last line has no trailing backslash so the line after the macro (the
 * function header) is not spliced into it.
 */
#define CGEMV_T_MSA() \
    for (j = (n >> 2); j--;) \
    { \
        tp0r = tp1r = tp2r = tp3r = zero; \
        tp0i = tp1i = tp2i = tp3i = zero; \
        \
        k = 0; \
        x = srcx_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_X8(); \
            CGEMV_T_8x4(); \
            \
            k += 2 * 8; \
            x += inc_x2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_X4(); \
            \
            CGEMV_T_4x4(); \
            \
            k += 2 * 4; \
            x += inc_x2 * 4; \
        } \
        \
        /* after the transposes, summing the four vectors leaves column c's \
           total in lane c */ \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r, \
                           tp0r, tp1r, tp2r, tp3r); \
        TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i, \
                           tp0i, tp1i, tp2i, tp3i); \
        \
        tp0r += tp1r; \
        tp0r += tp2r; \
        tp0r += tp3r; \
        tp0i += tp1i; \
        tp0i += tp2i; \
        tp0i += tp3i; \
        \
        temp0r = tp0r[0]; \
        temp1r = tp0r[1]; \
        temp2r = tp0r[2]; \
        temp3r = tp0r[3]; \
        temp0i = tp0i[0]; \
        temp1i = tp0i[1]; \
        temp2i = tp0i[2]; \
        temp3i = tp0i[3]; \
        \
        for (i = (m & 3); i--;) \
        { \
            CGEMV_T_1x4(); \
            \
            k += 2; \
            x += inc_x2; \
        } \
        \
        CSCALE_STORE_Y4_GP(); \
        \
        pa0 += 4 * lda2; \
        pa1 += 4 * lda2; \
        pa2 += 4 * lda2; \
        pa3 += 4 * lda2; \
        y += 4 * inc_y2; \
    } \
    \
    if (n & 2) \
    { \
        tp0r = tp1r = zero; \
        tp0i = tp1i = zero; \
        \
        k = 0; \
        x = srcx_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_X8(); \
            \
            CGEMV_T_8x2(); \
            \
            k += 2 * 8; \
            x += inc_x2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_X4(); \
            \
            CGEMV_T_4x2(); \
            \
            k += 2 * 4; \
            x += inc_x2 * 4; \
        } \
        \
        /* one transpose handles both columns: the summed vector holds \
           real(col0), real(col1), imag(col0), imag(col1) in lanes 0..3 */ \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i, \
                           tp0r, tp1r, tp0i, tp1i); \
        \
        tp0r += tp1r; \
        tp0r += tp0i; \
        tp0r += tp1i; \
        \
        temp0r = tp0r[0]; \
        temp1r = tp0r[1]; \
        temp0i = tp0r[2]; \
        temp1i = tp0r[3]; \
        \
        for (i = (m & 3); i--;) \
        { \
            CGEMV_T_1x2(); \
            \
            k += 2; \
            x += inc_x2; \
        } \
        \
        CSCALE_STORE_Y2_GP(); \
        \
        pa0 += 2 * lda2; \
        pa1 += 2 * lda2; \
        y += 2 * inc_y2; \
    } \
    \
    if (n & 1) \
    { \
        tp0r = zero; \
        tp0i = zero; \
        \
        k = 0; \
        x = srcx_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            CLOAD_X8(); \
            \
            CGEMV_T_8x1(); \
            \
            k += 2 * 8; \
            x += inc_x2 * 8; \
        } \
        \
        if (m & 4) \
        { \
            CLOAD_X4(); \
            \
            CGEMV_T_4x1(); \
            \
            k += 2 * 4; \
            x += inc_x2 * 4; \
        } \
        \
        /* interleave real/imag partial sums, then fold to one complex */ \
        ILVRL_W2_SP(tp0i, tp0r, t0, t1); \
        \
        t0 += t1; \
        \
        temp0r = t0[0] + t0[2]; \
        temp0i = t0[1] + t0[3]; \
        \
        for (i = (m & 3); i--;) \
        { \
            CGEMV_T_1x1(); \
            \
            k += 2; \
            x += inc_x2; \
        } \
        \
        CSCALE_STORE_Y1_GP(); \
        \
        pa0 += lda2; \
        y += inc_y2; \
    }
/* Kernel entry point that expands CGEMV_T_MSA with x-load macros chosen by
 * the stride of x.
 *
 * Every element-count style quantity is doubled before use (lda2, inc_x2,
 * inc_y2) -- presumably because each complex element is stored as an
 * interleaved (real, imag) pair of FLOATs; confirm against the macro bodies.
 *
 * m, n          : problem sizes consumed by CGEMV_T_MSA.
 * dummy1, buffer: not referenced directly in this function body.
 * alphar/alphai : not referenced directly here; presumably consumed by the
 *                 CSCALE_STORE_* macros -- confirm.
 * A, lda        : matrix base pointer and leading dimension.
 * x, inc_x      : input vector and its stride.
 * y, inc_y      : output vector and its stride.
 *
 * Always returns 0.
 *
 * NOTE: the locals declared below are referenced BY NAME inside the macro
 * expansions (e.g. i, k, x, srcx_org, pa0..pa3, tp0r.., temp0r.., t0, t1,
 * zero, lda2, inc_x2, inc_y2), so they must not be renamed or removed here
 * without updating the macros.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i, j, k;
FLOAT *pa0, *pa1, *pa2, *pa3;
/* Start of x; the macro rewinds x to this value before each column group. */
FLOAT *srcx_org = x;
FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
BLASLONG inc_x2, inc_y2, lda2;
v4f32 zero = {0};
v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
/* Strides expressed in FLOAT units (two FLOATs per element). */
lda2 = 2 * lda;
/* Four consecutive columns of A, lda2 FLOATs apart. */
pa0 = A;
pa1 = A + lda2;
pa2 = A + 2 * lda2;
pa3 = A + 3 * lda2;
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
/* inc_x2 == 2 means inc_x == 1: x is contiguous, so the *_VECTOR loaders
   are selected; otherwise the *_GP loaders are used. */
if (2 == inc_x2)
{
#define CLOAD_X8 CLOAD_X8_VECTOR
#define CLOAD_X4 CLOAD_X4_VECTOR
CGEMV_T_MSA();
#undef CLOAD_X8
#undef CLOAD_X4
}
else
{
#define CLOAD_X8 CLOAD_X8_GP
#define CLOAD_X4 CLOAD_X4_GP
CGEMV_T_MSA();
#undef CLOAD_X8
#undef CLOAD_X4
}
return(0);
}
#undef OP0
#undef OP1
#undef OP2

50
kernel/mips/copy.c Normal file
View File

@ -0,0 +1,50 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Strided vector copy: for k = 0 .. n-1, y[k * inc_y] = x[k * inc_x].
 *
 * n      : number of elements to copy; a negative value copies nothing.
 * x      : source vector, stepped by inc_x.
 * y      : destination vector, stepped by inc_y.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG count;
    BLASLONG src_idx = 0;
    BLASLONG dst_idx = 0;

    /* A negative length is treated as "nothing to do"; n == 0 simply skips
       the loop below. */
    if (n < 0)
    {
        return 0;
    }

    for (count = 0; count < n; count++)
    {
        y[dst_idx] = x[src_idx];

        src_idx += inc_x;
        dst_idx += inc_y;
    }

    return 0;
}

278
kernel/mips/dasum_msa.c Normal file
View File

@ -0,0 +1,278 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include <math.h>
#include "macros_msa.h"
/* Lane-wise absolute value of a v2f64: clears the sign bit of both lanes.
   Relies on a local `and_vec` mask being in scope where it is expanded. */
#define AND_VEC_D(in) ((v2f64) ((v2i64) (in) & and_vec))

/*
 * Sum of absolute values: returns sum_{k=0}^{n-1} |x[k * inc_x]|.
 *
 * n     : number of elements; returns 0.0 when n <= 0.
 * x     : input vector.
 * inc_x : stride between elements; returns 0.0 when inc_x <= 0.
 *
 * Unit stride: data is consumed as 2-lane vectors (two consecutive elements
 * per load), accumulated in four independent partial sums, and both lanes are
 * added at the end; an odd trailing element is handled with fabs().
 *
 * Non-unit stride: each element is still fetched with a 2-lane vector load at
 * its address, but only lane 0 contributes to the result. Such a load touches
 * x[idx] and x[idx + 1]; for the very last element x[idx + 1] lies beyond the
 * logical end of the vector. To avoid that out-of-bounds read the last
 * element is peeled off and handled with a scalar fabs(), so every remaining
 * vector load stays inside the range spanned by the strided vector (the
 * stride is >= 2 in this branch).
 *
 * The LD_DPk_INC(ptr, step, ...) macros are assumed to load k vectors and
 * advance ptr by step after each load -- defined in macros_msa.h.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    FLOAT sumf = 0.0;
    FLOAT tail_abs;
    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
    v2f64 zero_v = {0};
    v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};

    if (n <= 0 || inc_x <= 0) return (sumf);

    if (1 == inc_x)
    {
        /* Seed the four accumulators from the first 16 elements when
           available, otherwise start them at zero. */
        if (n > 15)
        {
            n -= 16;

            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_D(src0);
            sum_abs1 = AND_VEC_D(src1);
            sum_abs2 = AND_VEC_D(src2);
            sum_abs3 = AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        /* Main loop: 16 elements (8 vectors) per iteration. */
        for (i = (n >> 4); i--;)
        {
            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }

        if (n & 15)
        {
            /* Remaining full vectors: (n & 15) >> 1 of them, selected by the
               8/4/2 bits of n. */
            if ((n & 8) && (n & 4) && (n & 2))
            {
                LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
            }
            else if ((n & 8) && (n & 4))
            {
                LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
            }
            else if ((n & 8) && (n & 2))
            {
                LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_DP3_INC(x, 2, src0, src1, src2);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
            }
            else if (n & 8)
            {
                LD_DP4_INC(x, 2, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
            }
            else if (n & 4)
            {
                LD_DP2_INC(x, 2, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
            }
            else if (n & 2)
            {
                src0 = LD_DP(x); x += 2;

                sum_abs0 += AND_VEC_D(src0);
            }

            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

            /* Both lanes carry valid data in the unit-stride path. */
            sumf = sum_abs0[0] + sum_abs0[1];

            /* Odd trailing element. */
            if (n & 1)
            {
                sumf += fabs(*x);
            }
        }
        else
        {
            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

            sumf = sum_abs0[0] + sum_abs0[1];
        }
    }
    else
    {
        /* Peel the final element: a 2-lane load at its address would read one
           FLOAT past the end of the strided vector. Must be read before x is
           advanced by the load macros below. */
        n--;
        tail_abs = fabs(x[n * inc_x]);

        if (n > 8)
        {
            n -= 8;

            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_D(src0);
            sum_abs1 = AND_VEC_D(src1);
            sum_abs2 = AND_VEC_D(src2);
            sum_abs3 = AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        /* Main loop: 8 elements per iteration, one element per vector load. */
        for (i = (n >> 3); i--;)
        {
            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }

        if (n & 7)
        {
            /* Remaining (n & 7) elements, selected by the 4/2/1 bits of n. */
            if ((n & 4) && (n & 2) && (n & 1))
            {
                LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
            }
            else if ((n & 4) && (n & 1))
            {
                LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
            }
            else if ((n & 2) && (n & 1))
            {
                LD_DP3_INC(x, inc_x, src0, src1, src2);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
            }
            else if (n & 4)
            {
                LD_DP4_INC(x, inc_x, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
            }
            else if (n & 2)
            {
                LD_DP2_INC(x, inc_x, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
            }
            else if (n & 1)
            {
                src0 = LD_DP(x);

                sum_abs0 += AND_VEC_D(src0);
            }
        }

        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        /* Only lane 0 holds strided elements; lane 1 is a neighbouring value
           that is not part of the vector. Add the peeled last element. */
        sumf = sum_abs0[0] + tail_abs;
    }

    return (sumf);
}

189
kernel/mips/ddot_msa.c Normal file
View File

@ -0,0 +1,189 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Dot product: returns sum_{k=0}^{n-1} x[k * inc_x] * y[k * inc_y].
 *
 * The result is accumulated in a double and returned as double when DSDOT is
 * defined, otherwise as FLOAT. A negative n returns 0.0.
 *
 * When both strides are 1 the data is processed as 2-lane vectors
 * (16 elements per main-loop iteration, then the leftover vectors, then an
 * odd trailing element); otherwise elements are loaded one at a time with the
 * LD_GPk_INC macros, four per main-loop iteration.
 */
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
    BLASLONG blk = 0;
    double dot = 0.0;
    FLOAT xs0, xs1, xs2, xs3, ys0, ys1, ys2, ys3;
    v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
    v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
    v2f64 acc = {0, 0};

    if (n < 0)
    {
        return (dot);
    }

    if ((1 == inc_x) && (1 == inc_y))
    {
        /* Main loop: 8 vectors (16 elements) from each operand. */
        blk = n >> 4;
        while (blk--)
        {
            LD_DP8_INC(x, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
            LD_DP8_INC(y, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);

            acc += (yv0 * xv0);
            acc += (yv1 * xv1);
            acc += (yv2 * xv2);
            acc += (yv3 * xv3);
            acc += (yv4 * xv4);
            acc += (yv5 * xv5);
            acc += (yv6 * xv6);
            acc += (yv7 * xv7);
        }

        /* Leftover full vectors: the 8/4/2 bits of n give 0..7 of them. */
        switch ((n & 15) >> 1)
        {
            case 7:
                LD_DP7_INC(x, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
                LD_DP7_INC(y, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);

                acc += (yv0 * xv0);
                acc += (yv1 * xv1);
                acc += (yv2 * xv2);
                acc += (yv3 * xv3);
                acc += (yv4 * xv4);
                acc += (yv5 * xv5);
                acc += (yv6 * xv6);
                break;

            case 6:
                LD_DP6_INC(x, 2, xv0, xv1, xv2, xv3, xv4, xv5);
                LD_DP6_INC(y, 2, yv0, yv1, yv2, yv3, yv4, yv5);

                acc += (yv0 * xv0);
                acc += (yv1 * xv1);
                acc += (yv2 * xv2);
                acc += (yv3 * xv3);
                acc += (yv4 * xv4);
                acc += (yv5 * xv5);
                break;

            case 5:
                LD_DP5_INC(x, 2, xv0, xv1, xv2, xv3, xv4);
                LD_DP5_INC(y, 2, yv0, yv1, yv2, yv3, yv4);

                acc += (yv0 * xv0);
                acc += (yv1 * xv1);
                acc += (yv2 * xv2);
                acc += (yv3 * xv3);
                acc += (yv4 * xv4);
                break;

            case 4:
                LD_DP4_INC(x, 2, xv0, xv1, xv2, xv3);
                LD_DP4_INC(y, 2, yv0, yv1, yv2, yv3);

                acc += (yv0 * xv0);
                acc += (yv1 * xv1);
                acc += (yv2 * xv2);
                acc += (yv3 * xv3);
                break;

            case 3:
                LD_DP3_INC(x, 2, xv0, xv1, xv2);
                LD_DP3_INC(y, 2, yv0, yv1, yv2);

                acc += (yv0 * xv0);
                acc += (yv1 * xv1);
                acc += (yv2 * xv2);
                break;

            case 2:
                LD_DP2_INC(x, 2, xv0, xv1);
                LD_DP2_INC(y, 2, yv0, yv1);

                acc += (yv0 * xv0);
                acc += (yv1 * xv1);
                break;

            case 1:
                xv0 = LD_DP(x); x += 2;
                yv0 = LD_DP(y); y += 2;

                acc += (yv0 * xv0);
                break;

            default:
                break;
        }

        /* Odd trailing element goes straight into the scalar sum. */
        if (n & 1)
        {
            xs0 = *x;
            ys0 = *y;

            dot += (ys0 * xs0);
        }

        /* Fold the two vector lanes into the scalar result. */
        dot += acc[0];
        dot += acc[1];
    }
    else
    {
        /* Strided operands: scalar loads, four elements per iteration. */
        blk = n >> 2;
        while (blk--)
        {
            LD_GP4_INC(x, inc_x, xs0, xs1, xs2, xs3);
            LD_GP4_INC(y, inc_y, ys0, ys1, ys2, ys3);

            dot += (ys0 * xs0);
            dot += (ys1 * xs1);
            dot += (ys2 * xs2);
            dot += (ys3 * xs3);
        }

        switch (n & 3)
        {
            case 3:
                LD_GP3_INC(x, inc_x, xs0, xs1, xs2);
                LD_GP3_INC(y, inc_y, ys0, ys1, ys2);

                dot += (ys0 * xs0);
                dot += (ys1 * xs1);
                dot += (ys2 * xs2);
                break;

            case 2:
                LD_GP2_INC(x, inc_x, xs0, xs1);
                LD_GP2_INC(y, inc_y, ys0, ys1);

                dot += (ys0 * xs0);
                dot += (ys1 * xs1);
                break;

            case 1:
                xs0 = *x;
                ys0 = *y;

                dot += (ys0 * xs0);
                break;

            default:
                break;
        }
    }

    return (dot);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,118 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Packs an m x n block of src (n source lines spaced lda apart, m elements
 * each) into dst, interleaving lines in groups of 4, then 2, then 1.
 *
 * Within a group of k lines the output is written so that element i of every
 * line in the group is stored contiguously (k values), followed by element
 * i + 1, and so on -- this is what the scalar remainder loops do explicitly;
 * the vector path is presumed to produce the same order via ILVRL_D2_DP
 * (defined in macros_msa.h; confirm there).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG cnt, grp;
    FLOAT *next_line, *line0, *line1, *line2, *line3, *out;
    v2f64 v0a, v0b, v1a, v1b, v2a, v2b, v3a, v3b;
    v2f64 top_r01, top_r23, top_l01, top_l23;
    v2f64 bot_r01, bot_r23, bot_l01, bot_l23;

    next_line = src;
    out = dst;

    /* Groups of four source lines. */
    grp = n >> 2;
    while (grp--)
    {
        line0 = next_line;
        line1 = line0 + lda;
        line2 = line1 + lda;
        line3 = line2 + lda;
        next_line += 4 * lda;

        /* Four elements from each of the four lines per iteration. */
        cnt = m >> 2;
        while (cnt--)
        {
            LD_DP2_INC(line0, 2, v0a, v0b);
            LD_DP2_INC(line1, 2, v1a, v1b);
            LD_DP2_INC(line2, 2, v2a, v2b);
            LD_DP2_INC(line3, 2, v3a, v3b);

            ILVRL_D2_DP(v1a, v0a, top_r01, top_l01);
            ILVRL_D2_DP(v3a, v2a, top_r23, top_l23);
            ILVRL_D2_DP(v1b, v0b, bot_r01, bot_l01);
            ILVRL_D2_DP(v3b, v2b, bot_r23, bot_l23);

            ST_DP8_INC(top_r01, top_r23, top_l01, top_l23,
                       bot_r01, bot_r23, bot_l01, bot_l23, out, 2);
        }

        /* Leftover elements: one from each line, interleaved. */
        cnt = m & 3;
        while (cnt--)
        {
            *out++ = *line0++;
            *out++ = *line1++;
            *out++ = *line2++;
            *out++ = *line3++;
        }
    }

    /* A remaining pair of source lines. */
    if (n & 2)
    {
        line0 = next_line;
        line1 = line0 + lda;
        next_line += 2 * lda;

        cnt = m >> 2;
        while (cnt--)
        {
            LD_DP2_INC(line0, 2, v0a, v0b);
            LD_DP2_INC(line1, 2, v1a, v1b);

            ILVRL_D2_DP(v1a, v0a, top_r01, top_l01);
            ILVRL_D2_DP(v1b, v0b, bot_r01, bot_l01);

            ST_DP4_INC(top_r01, top_l01, bot_r01, bot_l01, out, 2);
        }

        cnt = m & 3;
        while (cnt--)
        {
            *out++ = *line0++;
            *out++ = *line1++;
        }
    }

    /* A final single source line: straight copy. */
    if (n & 1)
    {
        line0 = next_line;

        cnt = m >> 2;
        while (cnt--)
        {
            LD_DP2(line0, 2, v0a, v0b);
            line0 += 4;

            ST_DP2(v0a, v0b, out, 2);
            out += 4;
        }

        cnt = m & 3;
        while (cnt--)
        {
            *out++ = *line0++;
        }
    }

    return 0;
}

View File

@ -0,0 +1,186 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Packs an m x n block of src (n source lines spaced lda apart, m elements
 * each) into dst, interleaving lines in groups of 8, then 4, then 2, then 1.
 *
 * Within a group of k lines the scalar remainder loops store element i of
 * each of the k lines contiguously, then element i + 1, and so on; the vector
 * paths are presumed to emit the same order via ILVRL_D2_DP (defined in
 * macros_msa.h -- confirm there).
 *
 * The LD_DP2_INC / ST_DPk_INC macros are assumed to advance their pointer
 * argument after each vector -- confirm in macros_msa.h.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
FLOAT * __restrict dst)
{
BLASLONG i, j;
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
FLOAT *psrc8, *pdst;
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* psrc0 always points at the first line of the next group to process. */
psrc0 = src;
pdst = dst;
/* Groups of eight source lines. */
for (j = (n >> 3); j--;)
{
psrc1 = psrc0;
psrc2 = psrc1 + lda;
psrc3 = psrc2 + lda;
psrc4 = psrc3 + lda;
psrc5 = psrc4 + lda;
psrc6 = psrc5 + lda;
psrc7 = psrc6 + lda;
psrc8 = psrc7 + lda;
psrc0 += 8 * lda;
/* Each iteration consumes eight elements from every line, in two identical
   load / interleave / store phases; the src and dst registers are reused
   between phases, so statement order matters. */
for (i = (m >> 3); i--;)
{
/* Phase 1: two vectors from each of the eight lines. */
LD_DP2_INC(psrc1, 2, src0, src1);
LD_DP2_INC(psrc2, 2, src2, src3);
LD_DP2_INC(psrc3, 2, src4, src5);
LD_DP2_INC(psrc4, 2, src6, src7);
LD_DP2_INC(psrc5, 2, src8, src9);
LD_DP2_INC(psrc6, 2, src10, src11);
LD_DP2_INC(psrc7, 2, src12, src13);
LD_DP2_INC(psrc8, 2, src14, src15);
/* Interleave the first vector of each line pair and store. */
ILVRL_D2_DP(src2, src0, dst0, dst4);
ILVRL_D2_DP(src6, src4, dst1, dst5);
ILVRL_D2_DP(src10, src8, dst2, dst6);
ILVRL_D2_DP(src14, src12, dst3, dst7);
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
/* Interleave the second vector of each line pair and store. */
ILVRL_D2_DP(src3, src1, dst0, dst4);
ILVRL_D2_DP(src7, src5, dst1, dst5);
ILVRL_D2_DP(src11, src9, dst2, dst6);
ILVRL_D2_DP(src15, src13, dst3, dst7);
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
/* Phase 2: same sequence for the next two vectors of each line. */
LD_DP2_INC(psrc1, 2, src0, src1);
LD_DP2_INC(psrc2, 2, src2, src3);
LD_DP2_INC(psrc3, 2, src4, src5);
LD_DP2_INC(psrc4, 2, src6, src7);
LD_DP2_INC(psrc5, 2, src8, src9);
LD_DP2_INC(psrc6, 2, src10, src11);
LD_DP2_INC(psrc7, 2, src12, src13);
LD_DP2_INC(psrc8, 2, src14, src15);
ILVRL_D2_DP(src2, src0, dst0, dst4);
ILVRL_D2_DP(src6, src4, dst1, dst5);
ILVRL_D2_DP(src10, src8, dst2, dst6);
ILVRL_D2_DP(src14, src12, dst3, dst7);
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
ILVRL_D2_DP(src3, src1, dst0, dst4);
ILVRL_D2_DP(src7, src5, dst1, dst5);
ILVRL_D2_DP(src11, src9, dst2, dst6);
ILVRL_D2_DP(src15, src13, dst3, dst7);
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
}
/* Leftover elements (m & 7): one from each of the eight lines, interleaved. */
for (i = (m & 7); i--;)
{
*pdst++ = *psrc1++;
*pdst++ = *psrc2++;
*pdst++ = *psrc3++;
*pdst++ = *psrc4++;
*pdst++ = *psrc5++;
*pdst++ = *psrc6++;
*pdst++ = *psrc7++;
*pdst++ = *psrc8++;
}
}
/* A remaining group of four source lines. */
if (n & 4)
{
psrc1 = psrc0;
psrc2 = psrc1 + lda;
psrc3 = psrc2 + lda;
psrc4 = psrc3 + lda;
psrc0 += 4 * lda;
/* Four elements from each of the four lines per iteration. */
for (i = (m >> 2); i--;)
{
LD_DP2_INC(psrc1, 2, src0, src1);
LD_DP2_INC(psrc2, 2, src2, src3);
LD_DP2_INC(psrc3, 2, src4, src5);
LD_DP2_INC(psrc4, 2, src6, src7);
ILVRL_D2_DP(src2, src0, dst0, dst4);
ILVRL_D2_DP(src6, src4, dst1, dst5);
ILVRL_D2_DP(src3, src1, dst2, dst6);
ILVRL_D2_DP(src7, src5, dst3, dst7);
/* Note the store order differs from the 8-line case: dst0,dst1 then
   dst4,dst5, then dst2,dst3, then dst6,dst7. */
ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
}
for (i = (m & 3); i--;)
{
*pdst++ = *psrc1++;
*pdst++ = *psrc2++;
*pdst++ = *psrc3++;
*pdst++ = *psrc4++;
}
}
/* A remaining pair of source lines. */
if (n & 2)
{
psrc1 = psrc0;
psrc2 = psrc1 + lda;
psrc0 += 2 * lda;
/* Two elements from each of the two lines per iteration. */
for (i = (m >> 1); i--;)
{
src0 = LD_DP(psrc1);
src1 = LD_DP(psrc2);
psrc1 += 2;
psrc2 += 2;
ILVRL_D2_DP(src1, src0, dst0, dst1);
ST_DP2_INC(dst0, dst1, pdst, 2);
}
if (m & 1)
{
*pdst++ = *psrc1++;
*pdst++ = *psrc2++;
}
}
/* A final single source line: plain element-by-element copy. */
if (n & 1)
{
psrc1 = psrc0;
for (i = m; i--;)
{
*pdst++ = *psrc1++;
}
}
return 0;
}

View File

@ -0,0 +1,153 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Packs src (m source lines spaced lda apart, n elements each) into dst in
 * three regions:
 *   - starting at dst:                 chunks of 4 consecutive elements,
 *   - starting at dst + m * (n & ~3):  chunks of 2 elements (used if n & 2),
 *   - starting at dst + m * (n & ~1):  single elements (used if n & 1).
 *
 * Source lines are handled in groups of 4, then 2, then 1. For a group of k
 * lines, each 4-element step writes the k lines' chunks back to back
 * (4 * k values) and then jumps ahead by 4 * m values; the 2- and 1-element
 * regions are filled sequentially.
 *
 * LD_DP2_INC / ST_DPk_INC are assumed to advance their pointer argument,
 * while ST_DPk / ST_DP leave it unchanged -- confirm in macros_msa.h.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG cnt, grp;
    FLOAT *next_line, *line0, *line1, *line2, *line3;
    FLOAT *out_base, *out4, *out2, *out1;
    v2f64 va0, va1, vb0, vb1, vc0, vc1, vd0, vd1;

    next_line = src;
    out_base = dst;
    out2 = dst + m * (n & ~3);
    out1 = dst + m * (n & ~1);

    /* Groups of four source lines. */
    grp = m >> 2;
    while (grp--)
    {
        line0 = next_line;
        line1 = line0 + lda;
        line2 = line1 + lda;
        line3 = line2 + lda;
        next_line += 4 * lda;

        out4 = out_base;
        out_base += 16;

        /* Four elements from each line -> 16 contiguous values. */
        cnt = n >> 2;
        while (cnt--)
        {
            LD_DP2_INC(line0, 2, va0, va1);
            LD_DP2_INC(line1, 2, vb0, vb1);
            LD_DP2_INC(line2, 2, vc0, vc1);
            LD_DP2_INC(line3, 2, vd0, vd1);

            ST_DP8(va0, va1, vb0, vb1, vc0, vc1, vd0, vd1, out4, 2);
            out4 += m * 4;
        }

        if (n & 2)
        {
            va0 = LD_DP(line0);
            line0 += 2;
            vb0 = LD_DP(line1);
            line1 += 2;
            vc0 = LD_DP(line2);
            line2 += 2;
            vd0 = LD_DP(line3);
            line3 += 2;

            ST_DP4_INC(va0, vb0, vc0, vd0, out2, 2);
        }

        if (n & 1)
        {
            *out1++ = *line0++;
            *out1++ = *line1++;
            *out1++ = *line2++;
            *out1++ = *line3++;
        }
    }

    /* A remaining pair of source lines. */
    if (m & 2)
    {
        line0 = next_line;
        line1 = line0 + lda;
        next_line += 2 * lda;

        out4 = out_base;
        out_base += 8;

        cnt = n >> 2;
        while (cnt--)
        {
            LD_DP2_INC(line0, 2, va0, va1);
            LD_DP2_INC(line1, 2, vb0, vb1);

            ST_DP4(va0, va1, vb0, vb1, out4, 2);
            out4 += m * 4;
        }

        if (n & 2)
        {
            va0 = LD_DP(line0);
            line0 += 2;
            vb0 = LD_DP(line1);
            line1 += 2;

            ST_DP2_INC(va0, vb0, out2, 2);
        }

        if (n & 1)
        {
            *out1++ = *line0++;
            *out1++ = *line1++;
        }
    }

    /* A final single source line. */
    if (m & 1)
    {
        line0 = next_line;
        out4 = out_base;

        cnt = n >> 2;
        while (cnt--)
        {
            LD_DP2_INC(line0, 2, va0, va1);

            ST_DP2(va0, va1, out4, 2);
            out4 += 4 * m;
        }

        if (n & 2)
        {
            va0 = LD_DP(line0);
            line0 += 2;

            ST_DP(va0, out2);
        }

        if (n & 1)
        {
            *out1 = *line0;
        }
    }

    return 0;
}

View File

@ -0,0 +1,276 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Packs src (m source lines spaced lda apart, n elements each) into dst in
 * four regions:
 *   - starting at dst:                 chunks of 8 consecutive elements,
 *   - starting at dst + m * (n & ~7):  chunks of 4 elements (used if n & 4),
 *   - starting at dst + m * (n & ~3):  chunks of 2 elements (used if n & 2),
 *   - starting at dst + m * (n & ~1):  single elements (used if n & 1).
 *
 * Source lines are handled in groups of 8, then 4, then 2, then 1. For a
 * group of k lines, each 8-element step writes the k lines' chunks back to
 * back (8 * k values) and then jumps ahead by 8 * m values; the smaller
 * regions are filled sequentially.
 *
 * LD_DPk_INC / ST_DPk_INC are assumed to advance their pointer argument,
 * while ST_DPk / ST_DP leave it unchanged -- confirm in macros_msa.h.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
FLOAT * __restrict dst)
{
BLASLONG i, j;
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
/* psrc0: first line of the next group; pdst0: base of the 8-wide region
   for the next group; pdst2/3/4: write cursors of the 4-, 2- and 1-wide
   regions. */
psrc0 = src;
pdst0 = dst;
pdst2 = dst + m * (n & ~7);
pdst3 = dst + m * (n & ~3);
pdst4 = dst + m * (n & ~1);
/* Groups of eight source lines. */
for (j = (m >> 3); j--;)
{
psrc1 = psrc0;
psrc2 = psrc1 + lda;
psrc3 = psrc2 + lda;
psrc4 = psrc3 + lda;
psrc5 = psrc4 + lda;
psrc6 = psrc5 + lda;
psrc7 = psrc6 + lda;
psrc8 = psrc7 + lda;
psrc0 += 8 * lda;
pdst1 = pdst0;
pdst0 += 64;
/* Eight elements from each of the eight lines -> 64 contiguous values,
   written as four 16-value stores; the src registers are reused between
   the first and second half, so statement order matters. */
for (i = (n >> 3); i--;)
{
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
pdst1 + 16, 2);
LD_DP4_INC(psrc5, 2, src0, src1, src2, src3);
LD_DP4_INC(psrc6, 2, src4, src5, src6, src7);
LD_DP4_INC(psrc7, 2, src8, src9, src10, src11);
LD_DP4_INC(psrc8, 2, src12, src13, src14, src15);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32,
2);
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
pdst1 + 48, 2);
/* Jump to this group's slot in the next 8-element column panel. */
pdst1 += m * 8;
}
/* 4-wide tail: four elements from each of the eight lines. */
if (n & 4)
{
LD_DP2_INC(psrc1, 2, src0, src1);
LD_DP2_INC(psrc2, 2, src2, src3);
LD_DP2_INC(psrc3, 2, src4, src5);
LD_DP2_INC(psrc4, 2, src6, src7);
LD_DP2_INC(psrc5, 2, src8, src9);
LD_DP2_INC(psrc6, 2, src10, src11);
LD_DP2_INC(psrc7, 2, src12, src13);
LD_DP2_INC(psrc8, 2, src14, src15);
ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15,
pdst2, 2);
}
/* 2-wide tail: two elements from each of the eight lines. */
if (n & 2)
{
src0 = LD_DP(psrc1);
src1 = LD_DP(psrc2);
src2 = LD_DP(psrc3);
src3 = LD_DP(psrc4);
src4 = LD_DP(psrc5);
src5 = LD_DP(psrc6);
src6 = LD_DP(psrc7);
src7 = LD_DP(psrc8);
psrc1 += 2;
psrc2 += 2;
psrc3 += 2;
psrc4 += 2;
psrc5 += 2;
psrc6 += 2;
psrc7 += 2;
psrc8 += 2;
ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
}
/* 1-wide tail: one element from each of the eight lines. */
if (n & 1)
{
*pdst4++ = *psrc1++;
*pdst4++ = *psrc2++;
*pdst4++ = *psrc3++;
*pdst4++ = *psrc4++;
*pdst4++ = *psrc5++;
*pdst4++ = *psrc6++;
*pdst4++ = *psrc7++;
*pdst4++ = *psrc8++;
}
}
/* A remaining group of four source lines. */
if (m & 4)
{
psrc1 = psrc0;
psrc2 = psrc1 + lda;
psrc3 = psrc2 + lda;
psrc4 = psrc3 + lda;
psrc0 += 4 * lda;
pdst1 = pdst0;
pdst0 += 32;
/* Eight elements from each of the four lines -> 32 contiguous values. */
for (i = (n >> 3); i--;)
{
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
pdst1 + 16, 2);
pdst1 += 8 * m;
}
if (n & 4)
{
LD_DP2_INC(psrc1, 2, src0, src1);
LD_DP2_INC(psrc2, 2, src2, src3);
LD_DP2_INC(psrc3, 2, src4, src5);
LD_DP2_INC(psrc4, 2, src6, src7);
ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
}
if (n & 2)
{
src0 = LD_DP(psrc1);
src1 = LD_DP(psrc2);
src2 = LD_DP(psrc3);
src3 = LD_DP(psrc4);
psrc1 += 2;
psrc2 += 2;
psrc3 += 2;
psrc4 += 2;
ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
}
if (n & 1)
{
*pdst4++ = *psrc1++;
*pdst4++ = *psrc2++;
*pdst4++ = *psrc3++;
*pdst4++ = *psrc4++;
}
}
/* A remaining pair of source lines. */
if (m & 2)
{
psrc1 = psrc0;
psrc2 = psrc1 + lda;
psrc0 += 2 * lda;
pdst1 = pdst0;
pdst0 += 16;
/* Eight elements from each of the two lines -> 16 contiguous values. */
for (i = (n >> 3); i--;)
{
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
pdst1 += 8 * m;
}
if (n & 4)
{
LD_DP2_INC(psrc1, 2, src0, src1);
LD_DP2_INC(psrc2, 2, src2, src3);
ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
}
if (n & 2)
{
src0 = LD_DP(psrc1);
src1 = LD_DP(psrc2);
psrc1 += 2;
psrc2 += 2;
ST_DP2_INC(src0, src1, pdst3, 2);
}
if (n & 1)
{
*pdst4++ = *psrc1++;
*pdst4++ = *psrc2++;
}
}
/* A final single source line. */
if (m & 1)
{
psrc1 = psrc0;
/* psrc0 and pdst0 are advanced here but not read again before return. */
psrc0 += lda;
pdst1 = pdst0;
pdst0 += 8;
for (i = (n >> 3); i--;)
{
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
ST_DP4(src0, src1, src2, src3, pdst1, 2);
pdst1 += 8 * m;
}
if (n & 4)
{
LD_DP2_INC(psrc1, 2, src0, src1);
ST_DP2_INC(src0, src1, pdst2, 2);
}
if (n & 2)
{
src0 = LD_DP(psrc1);
psrc1 += 2;
ST_DP(src0, pdst3);
pdst3 += 2;
}
if (n & 1)
{
*pdst4++ = *psrc1++;
}
}
return 0;
}

577
kernel/mips/dgemv_n_msa.c Normal file
View File

@ -0,0 +1,577 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * DGEMV_N_8x8: add the contribution of eight A pointers (pa0..pa7) to the
 * four accumulators y0..y3.
 *
 * Everything is taken by name from the scope where the macro is expanded:
 *   pa0..pa7  pointers into A, read at element offset k
 *   tp0..tp7  one v2f64 multiplier per pointer (tpN pairs with paN)
 *   y0..y3    v2f64 accumulators, updated in place
 *   t0..t31   v2f64 scratch, overwritten
 * The products are added one pointer at a time, in pa0..pa7 order.
 * NOTE(review): LD_DP4 is defined in macros_msa.h (not visible here); it is
 * presumably a load of four v2f64 values spaced 2 doubles apart - confirm.
 */
#define DGEMV_N_8x8() \
{ \
LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
/* y0..y3 += tpN * (the four vectors loaded from paN), N = 0..7 in order */ \
y0 += tp0 * t0; \
y1 += tp0 * t1; \
y2 += tp0 * t2; \
y3 += tp0 * t3; \
\
y0 += tp1 * t4; \
y1 += tp1 * t5; \
y2 += tp1 * t6; \
y3 += tp1 * t7; \
\
y0 += tp2 * t8; \
y1 += tp2 * t9; \
y2 += tp2 * t10; \
y3 += tp2 * t11; \
\
y0 += tp3 * t12; \
y1 += tp3 * t13; \
y2 += tp3 * t14; \
y3 += tp3 * t15; \
\
y0 += tp4 * t16; \
y1 += tp4 * t17; \
y2 += tp4 * t18; \
y3 += tp4 * t19; \
\
y0 += tp5 * t20; \
y1 += tp5 * t21; \
y2 += tp5 * t22; \
y3 += tp5 * t23; \
\
y0 += tp6 * t24; \
y1 += tp6 * t25; \
y2 += tp6 * t26; \
y3 += tp6 * t27; \
\
y0 += tp7 * t28; \
y1 += tp7 * t29; \
y2 += tp7 * t30; \
y3 += tp7 * t31; \
}
/*
 * DGEMV_N_4x8: half-height variant of DGEMV_N_8x8.
 *
 * Loads two vectors (instead of four) from each of pa0..pa7 at offset k and
 * accumulates tpN * data into y0 and y1 only.  Scratch used: t0, t1, t4, t5,
 * t8, t9, t12, t13, t16, t17, t20, t21, t24, t25, t28, t29.
 * NOTE(review): LD_DP2 comes from macros_msa.h (not visible here);
 * presumably a two-vector load with a stride of 2 doubles - confirm.
 */
#define DGEMV_N_4x8() \
{ \
LD_DP2(pa0 + k, 2, t0, t1); \
LD_DP2(pa1 + k, 2, t4, t5); \
LD_DP2(pa2 + k, 2, t8, t9); \
LD_DP2(pa3 + k, 2, t12, t13); \
LD_DP2(pa4 + k, 2, t16, t17); \
LD_DP2(pa5 + k, 2, t20, t21); \
LD_DP2(pa6 + k, 2, t24, t25); \
LD_DP2(pa7 + k, 2, t28, t29); \
/* y0, y1 += tpN * (the two vectors loaded from paN), N = 0..7 in order */ \
y0 += tp0 * t0; \
y1 += tp0 * t1; \
\
y0 += tp1 * t4; \
y1 += tp1 * t5; \
\
y0 += tp2 * t8; \
y1 += tp2 * t9; \
\
y0 += tp3 * t12; \
y1 += tp3 * t13; \
\
y0 += tp4 * t16; \
y1 += tp4 * t17; \
\
y0 += tp5 * t20; \
y1 += tp5 * t21; \
\
y0 += tp6 * t24; \
y1 += tp6 * t25; \
\
y0 += tp7 * t28; \
y1 += tp7 * t29; \
}
/*
 * DGEMV_N_8x4: like DGEMV_N_8x8 but for four A pointers only.
 *
 * Loads four vectors from each of pa0..pa3 at offset k and accumulates
 * tp0..tp3 times that data into y0..y3.  Overwrites scratch t0..t15.
 */
#define DGEMV_N_8x4() \
{ \
LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
/* y0..y3 += tpN * (the four vectors loaded from paN), N = 0..3 in order */ \
y0 += tp0 * t0; \
y1 += tp0 * t1; \
y2 += tp0 * t2; \
y3 += tp0 * t3; \
\
y0 += tp1 * t4; \
y1 += tp1 * t5; \
y2 += tp1 * t6; \
y3 += tp1 * t7; \
\
y0 += tp2 * t8; \
y1 += tp2 * t9; \
y2 += tp2 * t10; \
y3 += tp2 * t11; \
\
y0 += tp3 * t12; \
y1 += tp3 * t13; \
y2 += tp3 * t14; \
y3 += tp3 * t15; \
}
/*
 * DGEMV_N_4x4: loads two vectors from each of pa0..pa3 at offset k and
 * accumulates tp0..tp3 times that data into y0 and y1.
 * Overwrites scratch t0, t1, t4, t5, t8, t9, t12, t13.
 */
#define DGEMV_N_4x4() \
{ \
LD_DP2(pa0 + k, 2, t0, t1); \
LD_DP2(pa1 + k, 2, t4, t5); \
LD_DP2(pa2 + k, 2, t8, t9); \
LD_DP2(pa3 + k, 2, t12, t13); \
/* y0, y1 += tpN * (the two vectors loaded from paN), N = 0..3 in order */ \
y0 += tp0 * t0; \
y1 += tp0 * t1; \
\
y0 += tp1 * t4; \
y1 += tp1 * t5; \
\
y0 += tp2 * t8; \
y1 += tp2 * t9; \
\
y0 += tp3 * t12; \
y1 += tp3 * t13; \
}
/*
 * DGEMV_N_8x2: loads four vectors from each of pa0 and pa1 at offset k and
 * accumulates tp0 / tp1 times that data into y0..y3.
 * Overwrites scratch t0..t7.
 */
#define DGEMV_N_8x2() \
{ \
LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
/* pa0's data first, then pa1's */ \
y0 += tp0 * t0; \
y1 += tp0 * t1; \
y2 += tp0 * t2; \
y3 += tp0 * t3; \
\
y0 += tp1 * t4; \
y1 += tp1 * t5; \
y2 += tp1 * t6; \
y3 += tp1 * t7; \
}
/*
 * DGEMV_N_4x2: loads two vectors from each of pa0 and pa1 at offset k and
 * accumulates tp0 / tp1 times that data into y0 and y1.
 * Overwrites scratch t0, t1, t4, t5.
 */
#define DGEMV_N_4x2() \
{ \
LD_DP2(pa0 + k, 2, t0, t1); \
LD_DP2(pa1 + k, 2, t4, t5); \
/* pa0's data first, then pa1's */ \
y0 += tp0 * t0; \
y1 += tp0 * t1; \
\
y0 += tp1 * t4; \
y1 += tp1 * t5; \
}
/*
 * DLOAD_X8_SCALE_GP: scale eight strided elements of x by alpha, one scalar
 * at a time, and turn each product into a vector.
 *
 * Reads  : alpha, x, inc_x
 * Writes : temp0..temp7  (tempN = alpha * x[N * inc_x])
 *          tp0..tp7      (tpN = COPY_DOUBLE_TO_VECTOR(tempN))
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement, and the last line carries no line-continuation backslash, so
 * the definition ends here no matter what follows it in the file.
 */
#define DLOAD_X8_SCALE_GP()                  \
do                                           \
{                                            \
    temp0 = alpha * x[0 * inc_x];            \
    temp1 = alpha * x[1 * inc_x];            \
    temp2 = alpha * x[2 * inc_x];            \
    temp3 = alpha * x[3 * inc_x];            \
    temp4 = alpha * x[4 * inc_x];            \
    temp5 = alpha * x[5 * inc_x];            \
    temp6 = alpha * x[6 * inc_x];            \
    temp7 = alpha * x[7 * inc_x];            \
                                             \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);      \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);      \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);      \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);      \
    tp4 = COPY_DOUBLE_TO_VECTOR(temp4);      \
    tp5 = COPY_DOUBLE_TO_VECTOR(temp5);      \
    tp6 = COPY_DOUBLE_TO_VECTOR(temp6);      \
    tp7 = COPY_DOUBLE_TO_VECTOR(temp7);      \
} while (0)
/*
 * DLOAD_X4_SCALE_GP: four-element form of the scalar x loader.
 *
 * Reads  : alpha, x, inc_x
 * Writes : temp0..temp3  (tempN = alpha * x[N * inc_x])
 *          tp0..tp3      (tpN = COPY_DOUBLE_TO_VECTOR(tempN))
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_X4_SCALE_GP()                  \
do                                           \
{                                            \
    temp0 = alpha * x[0 * inc_x];            \
    temp1 = alpha * x[1 * inc_x];            \
    temp2 = alpha * x[2 * inc_x];            \
    temp3 = alpha * x[3 * inc_x];            \
                                             \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);      \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);      \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);      \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);      \
} while (0)
/*
 * DLOAD_X8_SCALE_VECTOR: vector form of the x loader, selected by CNAME
 * when inc_x is 1.
 *
 * Reads  : x, v_alpha
 * Writes : x0..x3 (loaded from x, then multiplied by v_alpha)
 *          tp0..tp7 (two outputs per xN via SPLATI_D2_DP)
 * NOTE(review): LD_DP4 and SPLATI_D2_DP are defined in macros_msa.h (not
 * visible here); SPLATI_D2_DP presumably replicates lane 0 and lane 1 of its
 * first argument into the two outputs - confirm.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_X8_SCALE_VECTOR()      \
do                                   \
{                                    \
    LD_DP4(x, 2, x0, x1, x2, x3);    \
                                     \
    x0 = x0 * v_alpha;               \
    x1 = x1 * v_alpha;               \
    x2 = x2 * v_alpha;               \
    x3 = x3 * v_alpha;               \
                                     \
    SPLATI_D2_DP(x0, tp0, tp1);      \
    SPLATI_D2_DP(x1, tp2, tp3);      \
    SPLATI_D2_DP(x2, tp4, tp5);      \
    SPLATI_D2_DP(x3, tp6, tp7);      \
} while (0)
/*
 * DLOAD_X4_SCALE_VECTOR: four-element vector form of the x loader.
 *
 * Reads  : x, v_alpha
 * Writes : x0, x1 (loaded from x, then multiplied by v_alpha)
 *          tp0..tp3 (two outputs per xN via SPLATI_D2_DP)
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_X4_SCALE_VECTOR()      \
do                                   \
{                                    \
    LD_DP2(x, 2, x0, x1);            \
                                     \
    x0 = x0 * v_alpha;               \
    x1 = x1 * v_alpha;               \
                                     \
    SPLATI_D2_DP(x0, tp0, tp1);      \
    SPLATI_D2_DP(x1, tp2, tp3);      \
} while (0)
/*
 * DLOAD_Y8_GP: gather eight strided elements of y into y0..y3, two per
 * vector, using element inserts.
 *
 * Reads  : y, inc_y, tp0
 * Writes : y0..y3  (yN = { y[2N * inc_y], y[(2N + 1) * inc_y] })
 *
 * tp0 only supplies the starting vector for the first insert of each pair;
 * element 0 and then element 1 are both replaced.  Each value is read as a
 * 64-bit integer through a (long long *) cast of the FLOAT pointer.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_Y8_GP() \
do \
{ \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
    y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
    y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \
    y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \
    y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \
    y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \
} while (0)
/*
 * DLOAD_Y4_GP: gather four strided elements of y into y0 and y1, two per
 * vector, using element inserts.
 *
 * Reads  : y, inc_y, tp0
 * Writes : y0 = { y[0 * inc_y], y[1 * inc_y] }
 *          y1 = { y[2 * inc_y], y[3 * inc_y] }
 *
 * tp0 only supplies the starting vector for the first insert of each pair;
 * element 0 and then element 1 are both replaced.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_Y4_GP() \
do \
{ \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
    y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
    y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
} while (0)
/*
 * DLOAD_Y8_VECTOR / DLOAD_Y4_VECTOR: load y0..y3 (or y0, y1) straight from
 * y with LD_DP4 / LD_DP2.  CNAME selects these when inc_y is 1.
 *
 * The definitions deliberately do not end in ';' - every call site already
 * writes "DLOAD_Y8();", so a terminator here would expand to ";;".
 */
#define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3)
#define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1)
/*
 * DSTORE_Y8_GP: scatter the eight elements held in y0..y3 back to strided
 * positions of y, one element copy at a time.
 *
 * Reads  : y0..y3, y, inc_y
 * Writes : y[0 * inc_y] .. y[7 * inc_y]
 *          (element 0 of yN goes to y[2N * inc_y], element 1 to
 *          y[(2N + 1) * inc_y])
 * Each value is written as a 64-bit integer through a (long long *) cast of
 * the FLOAT pointer.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DSTORE_Y8_GP() \
do \
{ \
    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
    *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \
    *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \
    *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \
    *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \
} while (0)
/*
 * DSTORE_Y4_GP: scatter the four elements held in y0 and y1 back to strided
 * positions of y, one element copy at a time.
 *
 * Reads  : y0, y1, y, inc_y
 * Writes : y[0 * inc_y] .. y[3 * inc_y]
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DSTORE_Y4_GP() \
do \
{ \
    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
} while (0)
/*
 * DSTORE_Y8_VECTOR / DSTORE_Y4_VECTOR: write y0..y3 (or y0, y1) straight
 * back to y with ST_DP4 / ST_DP2.  CNAME selects these when inc_y is 1.
 *
 * The definitions deliberately do not end in ';' - every call site already
 * writes "DSTORE_Y8();", so a terminator here would expand to ";;".
 */
#define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2)
#define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2)
/*
 * DGEMV_N_MSA: the whole non-transposed GEMV update, y += alpha * A * x,
 * written once and instantiated by CNAME for each (inc_x, inc_y) case.
 *
 * Before expanding it the caller must #define DLOAD_X8_SCALE,
 * DLOAD_X4_SCALE, DLOAD_Y8, DLOAD_Y4, DSTORE_Y8 and DSTORE_Y4 to either the
 * _VECTOR or the _GP flavour.  All variables (m, n, lda, alpha, x, inc_x,
 * y, y_org, inc_y, pa0..pa7, i, j, k, temp, temp0..temp7, tp0..tp7, ...)
 * are taken by name from the enclosing scope.
 *
 * Structure: the pa pointers are consumed in groups of 8, then 4, then 2,
 * then 1 (driven by n).  For every group, y is rewound to y_org and walked
 * in chunks of 8, then 4, then a scalar tail of (m & 3) elements.
 *
 * The scalar tails recompute tempN = alpha * x[...] themselves because the
 * _VECTOR flavour of DLOAD_X*_SCALE does not assign temp0..temp7.
 *
 * The last line intentionally has no line-continuation backslash, so the
 * macro definition ends at its closing brace and cannot absorb the line
 * that follows it.
 */
#define DGEMV_N_MSA() \
    /* ---- groups of 8 (pa0..pa7) ---- */ \
    for (j = (n >> 3); j--;) \
    { \
        DLOAD_X8_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            DLOAD_Y8(); \
            DGEMV_N_8x8(); \
            DSTORE_Y8(); \
            \
            y += 8 * inc_y; \
            k += 8; \
        } \
        \
        if (m & 4) \
        { \
            DLOAD_Y4(); \
            DGEMV_N_4x8(); \
            DSTORE_Y4(); \
            \
            y += 4 * inc_y; \
            k += 4; \
        } \
        \
        if (m & 3) \
        { \
            temp0 = alpha * x[0 * inc_x]; \
            temp1 = alpha * x[1 * inc_x]; \
            temp2 = alpha * x[2 * inc_x]; \
            temp3 = alpha * x[3 * inc_x]; \
            temp4 = alpha * x[4 * inc_x]; \
            temp5 = alpha * x[5 * inc_x]; \
            temp6 = alpha * x[6 * inc_x]; \
            temp7 = alpha * x[7 * inc_x]; \
            \
            for (i = (m & 3); i--;) \
            { \
                temp = y[0]; \
                temp += temp0 * pa0[k]; \
                temp += temp1 * pa1[k]; \
                temp += temp2 * pa2[k]; \
                temp += temp3 * pa3[k]; \
                temp += temp4 * pa4[k]; \
                temp += temp5 * pa5[k]; \
                temp += temp6 * pa6[k]; \
                temp += temp7 * pa7[k]; \
                y[0] = temp; \
                \
                y += inc_y; \
                k++; \
            } \
        } \
        pa0 += 8 * lda; \
        pa1 += 8 * lda; \
        pa2 += 8 * lda; \
        pa3 += 8 * lda; \
        pa4 += 8 * lda; \
        pa5 += 8 * lda; \
        pa6 += 8 * lda; \
        pa7 += 8 * lda; \
        \
        x += 8 * inc_x; \
    } \
    \
    /* ---- one group of 4 (pa0..pa3) ---- */ \
    if (n & 4) \
    { \
        DLOAD_X4_SCALE(); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            DLOAD_Y8(); \
            DGEMV_N_8x4(); \
            DSTORE_Y8(); \
            \
            y += 8 * inc_y; \
            k += 8; \
        } \
        \
        if (m & 4) \
        { \
            DLOAD_Y4(); \
            DGEMV_N_4x4(); \
            DSTORE_Y4(); \
            \
            y += 4 * inc_y; \
            k += 4; \
        } \
        \
        if (m & 3) \
        { \
            temp0 = alpha * x[0 * inc_x]; \
            temp1 = alpha * x[1 * inc_x]; \
            temp2 = alpha * x[2 * inc_x]; \
            temp3 = alpha * x[3 * inc_x]; \
            \
            for (i = (m & 3); i--;) \
            { \
                temp = y[0]; \
                temp += temp0 * pa0[k]; \
                temp += temp1 * pa1[k]; \
                temp += temp2 * pa2[k]; \
                temp += temp3 * pa3[k]; \
                y[0] = temp; \
                \
                y += inc_y; \
                k++; \
            } \
        } \
        \
        pa0 += 4 * lda; \
        pa1 += 4 * lda; \
        pa2 += 4 * lda; \
        pa3 += 4 * lda; \
        \
        x += 4 * inc_x; \
    } \
    \
    /* ---- one group of 2 (pa0, pa1) ---- */ \
    if (n & 2) \
    { \
        temp0 = alpha * x[0 * inc_x]; \
        temp1 = alpha * x[1 * inc_x]; \
        \
        tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
        tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = (m >> 3); i--;) \
        { \
            DLOAD_Y8(); \
            DGEMV_N_8x2(); \
            DSTORE_Y8(); \
            \
            y += 8 * inc_y; \
            k += 8; \
        } \
        \
        if (m & 4) \
        { \
            DLOAD_Y4(); \
            DGEMV_N_4x2(); \
            DSTORE_Y4(); \
            \
            y += 4 * inc_y; \
            k += 4; \
        } \
        \
        if (m & 3) \
        { \
            temp0 = alpha * x[0 * inc_x]; \
            temp1 = alpha * x[1 * inc_x]; \
            \
            for (i = (m & 3); i--;) \
            { \
                temp = y[0]; \
                temp += temp0 * pa0[k]; \
                temp += temp1 * pa1[k]; \
                y[0] = temp; \
                \
                y += inc_y; \
                k++; \
            } \
        } \
        \
        pa0 += 2 * lda; \
        pa1 += 2 * lda; \
        \
        x += 2 * inc_x; \
    } \
    \
    /* ---- last single pointer (pa0), fully scalar ---- */ \
    if (n & 1) \
    { \
        temp = alpha * x[0]; \
        \
        k = 0; \
        y = y_org; \
        \
        for (i = m; i--;) \
        { \
            y[0] += temp * pa0[k]; \
            y += inc_y; \
            k++; \
        } \
    }
/*
 * Non-transposed double-precision GEMV kernel: y += alpha * A * x.
 *
 *   m, n      extent of the walk along each pa pointer (m) and number of
 *             pa pointers consumed (n)
 *   dummy1    not referenced
 *   alpha     scalar applied to every x element
 *   A, lda    matrix data; consecutive pa pointers are lda elements apart
 *   x, inc_x  input vector and its element stride
 *   y, inc_y  vector updated in place and its element stride
 *   buffer    not referenced
 *
 * Always returns 0.
 *
 * The arithmetic lives in DGEMV_N_MSA; this function only picks which
 * load/store helper macros that expansion uses, based on whether inc_x and
 * inc_y are 1.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    /* Every local here is referenced by name from DGEMV_N_MSA and the
       macros it expands, so the identifiers must not be renamed. */
    BLASLONG i, j, k;
    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v2f64 v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

    /* start of y, restored for every group of pa pointers */
    FLOAT *y_org = y;

    /* eight pointers into A, each lda elements after the previous one */
    FLOAT *pa0 = A;
    FLOAT *pa1 = pa0 + lda;
    FLOAT *pa2 = pa1 + lda;
    FLOAT *pa3 = pa2 + lda;
    FLOAT *pa4 = pa3 + lda;
    FLOAT *pa5 = pa4 + lda;
    FLOAT *pa6 = pa5 + lda;
    FLOAT *pa7 = pa6 + lda;

    if (1 == inc_y)
    {
        if (1 == inc_x)
        {
            /* inc_x == 1, inc_y == 1: vector helpers for both x and y */
            #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
            #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
            #define DLOAD_Y8 DLOAD_Y8_VECTOR
            #define DLOAD_Y4 DLOAD_Y4_VECTOR
            #define DSTORE_Y8 DSTORE_Y8_VECTOR
            #define DSTORE_Y4 DSTORE_Y4_VECTOR
            DGEMV_N_MSA();
            #undef DLOAD_X8_SCALE
            #undef DLOAD_X4_SCALE
            #undef DLOAD_Y8
            #undef DLOAD_Y4
            #undef DSTORE_Y8
            #undef DSTORE_Y4
        }
        else
        {
            /* inc_x != 1, inc_y == 1: scalar (GP) x, vector y */
            #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
            #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
            #define DLOAD_Y8 DLOAD_Y8_VECTOR
            #define DLOAD_Y4 DLOAD_Y4_VECTOR
            #define DSTORE_Y8 DSTORE_Y8_VECTOR
            #define DSTORE_Y4 DSTORE_Y4_VECTOR
            DGEMV_N_MSA();
            #undef DLOAD_X8_SCALE
            #undef DLOAD_X4_SCALE
            #undef DLOAD_Y8
            #undef DLOAD_Y4
            #undef DSTORE_Y8
            #undef DSTORE_Y4
        }
    }
    else if (1 == inc_x)
    {
        /* inc_x == 1, inc_y != 1: vector x, scalar (GP) y */
        #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
        #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
        #define DLOAD_Y8 DLOAD_Y8_GP
        #define DLOAD_Y4 DLOAD_Y4_GP
        #define DSTORE_Y8 DSTORE_Y8_GP
        #define DSTORE_Y4 DSTORE_Y4_GP
        DGEMV_N_MSA();
        #undef DLOAD_X8_SCALE
        #undef DLOAD_X4_SCALE
        #undef DLOAD_Y8
        #undef DLOAD_Y4
        #undef DSTORE_Y8
        #undef DSTORE_Y4
    }
    else
    {
        /* neither stride is 1: scalar (GP) helpers for both x and y */
        #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
        #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
        #define DLOAD_Y8 DLOAD_Y8_GP
        #define DLOAD_Y4 DLOAD_Y4_GP
        #define DSTORE_Y8 DSTORE_Y8_GP
        #define DSTORE_Y4 DSTORE_Y4_GP
        DGEMV_N_MSA();
        #undef DLOAD_X8_SCALE
        #undef DLOAD_X4_SCALE
        #undef DLOAD_Y8
        #undef DLOAD_Y4
        #undef DSTORE_Y8
        #undef DSTORE_Y4
    }

    return 0;
}

589
kernel/mips/dgemv_t_msa.c Normal file
View File

@ -0,0 +1,589 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * DGEMV_T_8x8: accumulate partial products for eight A pointers at once.
 *
 * For each paN (N = 0..7) the four vectors loaded at offset k are multiplied
 * by x0..x3 respectively and added, in that order, into tpN.  tpN therefore
 * holds a two-lane partial sum; DGEMV_T_MSA combines the lanes afterwards.
 *
 * Taken by name from the expanding scope: pa0..pa7, k, x0..x3, tp0..tp7.
 * Overwrites scratch t0..t31.
 * NOTE(review): LD_DP4 is defined in macros_msa.h (not visible here);
 * presumably a four-vector load with a stride of 2 doubles - confirm.
 */
#define DGEMV_T_8x8() \
{ \
LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
/* tpN += x0..x3 * (the four vectors loaded from paN) */ \
tp0 += x0 * t0; \
tp0 += x1 * t1; \
tp0 += x2 * t2; \
tp0 += x3 * t3; \
\
tp1 += x0 * t4; \
tp1 += x1 * t5; \
tp1 += x2 * t6; \
tp1 += x3 * t7; \
\
tp2 += x0 * t8; \
tp2 += x1 * t9; \
tp2 += x2 * t10; \
tp2 += x3 * t11; \
\
tp3 += x0 * t12; \
tp3 += x1 * t13; \
tp3 += x2 * t14; \
tp3 += x3 * t15; \
\
tp4 += x0 * t16; \
tp4 += x1 * t17; \
tp4 += x2 * t18; \
tp4 += x3 * t19; \
\
tp5 += x0 * t20; \
tp5 += x1 * t21; \
tp5 += x2 * t22; \
tp5 += x3 * t23; \
\
tp6 += x0 * t24; \
tp6 += x1 * t25; \
tp6 += x2 * t26; \
tp6 += x3 * t27; \
\
tp7 += x0 * t28; \
tp7 += x1 * t29; \
tp7 += x2 * t30; \
tp7 += x3 * t31; \
}
/*
 * DGEMV_T_8x4: same as DGEMV_T_8x8 but with two vectors per pointer.
 *
 * For each paN (N = 0..7) the two vectors loaded at offset k are multiplied
 * by x0 and x1 and added into tpN.  Overwrites scratch t0, t1, t4, t5, t8,
 * t9, t12, t13, t16, t17, t20, t21, t24, t25, t28, t29.
 */
#define DGEMV_T_8x4() \
{ \
LD_DP2(pa0 + k, 2, t0, t1); \
LD_DP2(pa1 + k, 2, t4, t5); \
LD_DP2(pa2 + k, 2, t8, t9); \
LD_DP2(pa3 + k, 2, t12, t13); \
LD_DP2(pa4 + k, 2, t16, t17); \
LD_DP2(pa5 + k, 2, t20, t21); \
LD_DP2(pa6 + k, 2, t24, t25); \
LD_DP2(pa7 + k, 2, t28, t29); \
/* tpN += x0, x1 * (the two vectors loaded from paN) */ \
tp0 += x0 * t0; \
tp0 += x1 * t1; \
\
tp1 += x0 * t4; \
tp1 += x1 * t5; \
\
tp2 += x0 * t8; \
tp2 += x1 * t9; \
\
tp3 += x0 * t12; \
tp3 += x1 * t13; \
\
tp4 += x0 * t16; \
tp4 += x1 * t17; \
\
tp5 += x0 * t20; \
tp5 += x1 * t21; \
\
tp6 += x0 * t24; \
tp6 += x1 * t25; \
\
tp7 += x0 * t28; \
tp7 += x1 * t29; \
}
/*
 * DGEMV_T_8x2: one vector per pointer.
 *
 * Loads a single vector from each of pa0..pa7 at offset k, multiplies it by
 * x0 and adds the product into the matching tp0..tp7.
 * Overwrites scratch t0, t4, t8, t12, t16, t20, t24, t28.
 */
#define DGEMV_T_8x2() \
{ \
t0 = LD_DP(pa0 + k); \
t4 = LD_DP(pa1 + k); \
t8 = LD_DP(pa2 + k); \
t12 = LD_DP(pa3 + k); \
t16 = LD_DP(pa4 + k); \
t20 = LD_DP(pa5 + k); \
t24 = LD_DP(pa6 + k); \
t28 = LD_DP(pa7 + k); \
/* tpN += x0 * (the vector loaded from paN) */ \
tp0 += x0 * t0; \
tp1 += x0 * t4; \
tp2 += x0 * t8; \
tp3 += x0 * t12; \
tp4 += x0 * t16; \
tp5 += x0 * t20; \
tp6 += x0 * t24; \
tp7 += x0 * t28; \
}
/*
 * DGEMV_T_4x8: four-pointer form of DGEMV_T_8x8.
 *
 * For each of pa0..pa3 the four vectors loaded at offset k are multiplied by
 * x0..x3 and added into tp0..tp3.  Overwrites scratch t0..t15.
 */
#define DGEMV_T_4x8() \
{ \
LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
/* tpN += x0..x3 * (the four vectors loaded from paN) */ \
tp0 += x0 * t0; \
tp0 += x1 * t1; \
tp0 += x2 * t2; \
tp0 += x3 * t3; \
\
tp1 += x0 * t4; \
tp1 += x1 * t5; \
tp1 += x2 * t6; \
tp1 += x3 * t7; \
\
tp2 += x0 * t8; \
tp2 += x1 * t9; \
tp2 += x2 * t10; \
tp2 += x3 * t11; \
\
tp3 += x0 * t12; \
tp3 += x1 * t13; \
tp3 += x2 * t14; \
tp3 += x3 * t15; \
}
/*
 * DGEMV_T_4x4: for each of pa0..pa3 the two vectors loaded at offset k are
 * multiplied by x0 and x1 and added into tp0..tp3.
 * Overwrites scratch t0, t1, t4, t5, t8, t9, t12, t13.
 */
#define DGEMV_T_4x4() \
{ \
LD_DP2(pa0 + k, 2, t0, t1); \
LD_DP2(pa1 + k, 2, t4, t5); \
LD_DP2(pa2 + k, 2, t8, t9); \
LD_DP2(pa3 + k, 2, t12, t13); \
/* tpN += x0, x1 * (the two vectors loaded from paN) */ \
tp0 += x0 * t0; \
tp0 += x1 * t1; \
\
tp1 += x0 * t4; \
tp1 += x1 * t5; \
\
tp2 += x0 * t8; \
tp2 += x1 * t9; \
\
tp3 += x0 * t12; \
tp3 += x1 * t13; \
}
/*
 * DGEMV_T_4x2: loads a single vector from each of pa0..pa3 at offset k,
 * multiplies it by x0 and adds the product into tp0..tp3.
 * Overwrites scratch t0, t4, t8, t12.
 */
#define DGEMV_T_4x2() \
{ \
t0 = LD_DP(pa0 + k); \
t4 = LD_DP(pa1 + k); \
t8 = LD_DP(pa2 + k); \
t12 = LD_DP(pa3 + k); \
/* tpN += x0 * (the vector loaded from paN) */ \
tp0 += x0 * t0; \
tp1 += x0 * t4; \
tp2 += x0 * t8; \
tp3 += x0 * t12; \
}
/*
 * DGEMV_T_2x8: two-pointer form of DGEMV_T_8x8.
 *
 * The four vectors loaded from pa0 and from pa1 at offset k are multiplied
 * by x0..x3 and added into tp0 and tp1.  Overwrites scratch t0..t7.
 */
#define DGEMV_T_2x8() \
{ \
LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
/* tp0 gets pa0's data, tp1 gets pa1's */ \
tp0 += x0 * t0; \
tp0 += x1 * t1; \
tp0 += x2 * t2; \
tp0 += x3 * t3; \
\
tp1 += x0 * t4; \
tp1 += x1 * t5; \
tp1 += x2 * t6; \
tp1 += x3 * t7; \
}
/*
 * DGEMV_T_2x4: the two vectors loaded from pa0 and from pa1 at offset k are
 * multiplied by x0 and x1 and added into tp0 and tp1.
 * Overwrites scratch t0, t1, t4, t5.
 */
#define DGEMV_T_2x4() \
{ \
LD_DP2(pa0 + k, 2, t0, t1); \
LD_DP2(pa1 + k, 2, t4, t5); \
/* tp0 gets pa0's data, tp1 gets pa1's */ \
tp0 += x0 * t0; \
tp0 += x1 * t1; \
\
tp1 += x0 * t4; \
tp1 += x1 * t5; \
}
/*
 * DGEMV_T_2x2: loads one vector from pa0 and one from pa1 at offset k,
 * multiplies each by x0 and adds the products into tp0 and tp1.
 * Overwrites scratch t0 and t4.
 */
#define DGEMV_T_2x2() \
{ \
t0 = LD_DP(pa0 + k); \
t4 = LD_DP(pa1 + k); \
/* tp0 gets pa0's data, tp1 gets pa1's */ \
tp0 += x0 * t0; \
tp1 += x0 * t4; \
}
/*
 * DLOAD_X8_GP: gather eight strided elements of x into x0..x3, two per
 * vector, using element inserts.
 *
 * Reads  : x, inc_x, tp0
 * Writes : x0..x3  (xN = { x[2N * inc_x], x[(2N + 1) * inc_x] })
 *
 * tp0 only supplies the starting vector for the first insert of each pair;
 * element 0 and then element 1 are both replaced.  Each value is read as a
 * 64-bit integer through a (long long *) cast of the FLOAT pointer.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_X8_GP() \
do \
{ \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \
    x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \
    x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x))); \
    x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \
    x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x))); \
} while (0)
/*
 * DLOAD_X4_GP: gather four strided elements of x into x0 and x1, two per
 * vector, using element inserts.
 *
 * Reads  : x, inc_x, tp0
 * Writes : x0 = { x[0 * inc_x], x[1 * inc_x] }
 *          x1 = { x[2 * inc_x], x[3 * inc_x] }
 *
 * tp0 only supplies the starting vector for the first insert of each pair;
 * element 0 and then element 1 are both replaced.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_X4_GP() \
do \
{ \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \
    x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \
} while (0)
/*
 * DLOAD_X2_GP: gather two strided elements of x into x0 using element
 * inserts.
 *
 * Reads  : x, inc_x, tp0
 * Writes : x0 = { x[0 * inc_x], x[1 * inc_x] }
 *
 * tp0 only supplies the starting vector for the first insert; element 0 and
 * then element 1 are both replaced.
 *
 * Wrapped in do { } while (0) so it expands to one statement; the final
 * line has no line-continuation backslash, so the definition cannot absorb
 * the line that follows it.
 */
#define DLOAD_X2_GP() \
do \
{ \
    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
    x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \
} while (0)
/*
 * DLOAD_X8_VECTOR / DLOAD_X4_VECTOR / DLOAD_X2_VECTOR: load x0..x3, x0..x1
 * or x0 straight from x with LD_DP4 / LD_DP2 / LD_DP.  CNAME selects these
 * when inc_x is 1.
 *
 * The definitions deliberately do not end in ';' - every call site already
 * writes "DLOAD_X8();", so a terminator here would expand to ";;".
 */
#define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3)
#define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1)
#define DLOAD_X2_VECTOR() x0 = LD_DP(x)
/*
 * DGEMV_T_MSA: the whole transposed GEMV update, written once and
 * instantiated by CNAME for the inc_x == 1 and inc_x != 1 cases.
 *
 * Before expanding it the caller must #define DLOAD_X8, DLOAD_X4 and
 * DLOAD_X2 to either the _VECTOR or the _GP flavour.  All variables (m, n,
 * lda, alpha, x, srcx_org, inc_x, y, inc_y, pa0..pa7, i, j, k, zero,
 * temp0..temp7, res0..res7, tp0..tp7, t0..t31, ...) are taken by name from
 * the enclosing scope.
 *
 * Structure: the pa pointers are consumed in groups of 8, then 4, then 2,
 * then 1 (driven by n).  For each group x is rewound to srcx_org, the
 * products pa[k] * x are summed over m elements (chunks of 8, 4, 2 in vector
 * code, a final odd element in scalar code), and each sum is scaled by alpha
 * and added to one element of y.  y and the pa pointers advance past the
 * group afterwards.
 *
 * NOTE(review): ILVRL_D2_DP and ADD2 are defined in macros_msa.h (not
 * visible here); they are presumably an interleave of the two lanes followed
 * by a pairwise add, leaving one complete sum per lane - confirm.
 */
#define DGEMV_T_MSA() \
for (j = (n >> 3); j--;) \
{ \
tp0 = zero; \
tp1 = zero; \
tp2 = zero; \
tp3 = zero; \
tp4 = zero; \
tp5 = zero; \
tp6 = zero; \
tp7 = zero; \
/* restart at the first element of x for this group of pa pointers */ \
k = 0; \
x = srcx_org; \
\
for (i = (m >> 3); i--;) \
{ \
DLOAD_X8(); \
DGEMV_T_8x8(); \
\
x += 8 * inc_x; \
k += 8; \
} \
\
if (m & 4) \
{ \
DLOAD_X4(); \
DGEMV_T_8x4(); \
\
x += 4 * inc_x; \
k += 4; \
} \
\
if (m & 2) \
{ \
DLOAD_X2(); \
DGEMV_T_8x2(); \
\
x += 2 * inc_x; \
k += 2; \
} \
/* fold the two lanes of tp0..tp7 into t0..t3 (see NOTE in the header) */ \
ILVRL_D2_DP(tp1, tp0, t0, t4); \
ILVRL_D2_DP(tp3, tp2, t1, t5); \
ILVRL_D2_DP(tp5, tp4, t2, t6); \
ILVRL_D2_DP(tp7, tp6, t3, t7); \
ADD2(t0, t4, t1, t5, t0, t1); \
ADD2(t2, t6, t3, t7, t2, t3); \
/* pull the eight sums out as scalars */ \
temp0 = t0[0]; \
temp1 = t0[1]; \
temp2 = t1[0]; \
temp3 = t1[1]; \
temp4 = t2[0]; \
temp5 = t2[1]; \
temp6 = t3[0]; \
temp7 = t3[1]; \
/* odd last element of the m-walk, handled in scalar code */ \
if (m & 1) \
{ \
temp0 += pa0[k] * x[0]; \
temp1 += pa1[k] * x[0]; \
temp2 += pa2[k] * x[0]; \
temp3 += pa3[k] * x[0]; \
temp4 += pa4[k] * x[0]; \
temp5 += pa5[k] * x[0]; \
temp6 += pa6[k] * x[0]; \
temp7 += pa7[k] * x[0]; \
\
x += inc_x; \
k++; \
} \
/* y[N * inc_y] += alpha * tempN, done as load / add / store */ \
res0 = y[0 * inc_y]; \
res1 = y[1 * inc_y]; \
res2 = y[2 * inc_y]; \
res3 = y[3 * inc_y]; \
res4 = y[4 * inc_y]; \
res5 = y[5 * inc_y]; \
res6 = y[6 * inc_y]; \
res7 = y[7 * inc_y]; \
\
res0 += alpha * temp0; \
res1 += alpha * temp1; \
res2 += alpha * temp2; \
res3 += alpha * temp3; \
res4 += alpha * temp4; \
res5 += alpha * temp5; \
res6 += alpha * temp6; \
res7 += alpha * temp7; \
\
y[0 * inc_y] = res0; \
y[1 * inc_y] = res1; \
y[2 * inc_y] = res2; \
y[3 * inc_y] = res3; \
y[4 * inc_y] = res4; \
y[5 * inc_y] = res5; \
y[6 * inc_y] = res6; \
y[7 * inc_y] = res7; \
\
y += 8 * inc_y; \
/* advance every pa pointer past the eight just consumed */ \
pa0 += 8 * lda; \
pa1 += 8 * lda; \
pa2 += 8 * lda; \
pa3 += 8 * lda; \
pa4 += 8 * lda; \
pa5 += 8 * lda; \
pa6 += 8 * lda; \
pa7 += 8 * lda; \
} \
/* ---- one group of 4 (pa0..pa3), same steps as above ---- */ \
if (n & 4) \
{ \
tp0 = zero; \
tp1 = zero; \
tp2 = zero; \
tp3 = zero; \
\
k = 0; \
x = srcx_org; \
\
for (i = (m >> 3); i--;) \
{ \
DLOAD_X8(); \
DGEMV_T_4x8(); \
\
x += 8 * inc_x; \
k += 8; \
} \
\
if (m & 4) \
{ \
DLOAD_X4(); \
DGEMV_T_4x4(); \
\
x += 4 * inc_x; \
k += 4; \
} \
\
if (m & 2) \
{ \
DLOAD_X2(); \
DGEMV_T_4x2(); \
\
x += 2 * inc_x; \
k += 2; \
} \
\
ILVRL_D2_DP(tp1, tp0, t0, t4); \
ILVRL_D2_DP(tp3, tp2, t1, t5); \
ADD2(t0, t4, t1, t5, t0, t1); \
\
temp0 = t0[0]; \
temp1 = t0[1]; \
temp2 = t1[0]; \
temp3 = t1[1]; \
\
if (m & 1) \
{ \
temp0 += pa0[k] * x[0]; \
temp1 += pa1[k] * x[0]; \
temp2 += pa2[k] * x[0]; \
temp3 += pa3[k] * x[0]; \
\
x += inc_x; \
k++; \
} \
\
res0 = y[0 * inc_y]; \
res1 = y[1 * inc_y]; \
res2 = y[2 * inc_y]; \
res3 = y[3 * inc_y]; \
\
res0 += alpha * temp0; \
res1 += alpha * temp1; \
res2 += alpha * temp2; \
res3 += alpha * temp3; \
\
y[0 * inc_y] = res0; \
y[1 * inc_y] = res1; \
y[2 * inc_y] = res2; \
y[3 * inc_y] = res3; \
\
y += 4 * inc_y; \
\
pa0 += 4 * lda; \
pa1 += 4 * lda; \
pa2 += 4 * lda; \
pa3 += 4 * lda; \
} \
/* ---- one group of 2 (pa0, pa1) ---- */ \
if (n & 2) \
{ \
tp0 = zero; \
tp1 = zero; \
\
k = 0; \
x = srcx_org; \
\
for (i = (m >> 3); i--;) \
{ \
DLOAD_X8(); \
DGEMV_T_2x8(); \
\
x += 8 * inc_x; \
k += 8; \
} \
\
if (m & 4) \
{ \
DLOAD_X4(); \
DGEMV_T_2x4(); \
\
x += 4 * inc_x; \
k += 4; \
} \
\
if (m & 2) \
{ \
DLOAD_X2(); \
DGEMV_T_2x2(); \
\
x += 2 * inc_x; \
k += 2; \
} \
\
ILVRL_D2_DP(tp1, tp0, t0, t4); \
\
t0 += t4; \
\
temp0 = t0[0]; \
temp1 = t0[1]; \
\
if (m & 1) \
{ \
temp0 += pa0[k] * x[0]; \
temp1 += pa1[k] * x[0]; \
x += inc_x; \
k++; \
} \
\
res0 = y[0 * inc_y]; \
res1 = y[1 * inc_y]; \
\
res0 += alpha * temp0; \
res1 += alpha * temp1; \
\
y[0 * inc_y] = res0; \
y[1 * inc_y] = res1; \
\
y += 2 * inc_y; \
\
pa0 += 2 * lda; \
pa1 += 2 * lda; \
} \
/* ---- last single pointer (pa0), fully scalar ---- */ \
if (n & 1) \
{ \
temp0 = 0.0; \
\
k = 0; \
x = srcx_org; \
\
for (i = m; i--;) \
{ \
temp0 += pa0[k] * x[0]; \
x += inc_x; \
k++; \
} \
\
y[0] += alpha * temp0; \
y += inc_y; \
pa0 += lda; \
}
/*
 * Transposed double-precision GEMV kernel.
 *
 * For each of the n pa pointers (spaced lda elements apart in A) the
 * products pa[k] * x[k * inc_x] are summed over k = 0..m-1, scaled by alpha
 * and added to the matching element of y.
 *
 *   m, n      length of each sum (m) and number of sums / y elements (n)
 *   dummy1    not referenced
 *   alpha     scalar applied to every sum
 *   A, lda    matrix data; consecutive pa pointers are lda elements apart
 *   x, inc_x  input vector and its element stride
 *   y, inc_y  vector updated in place and its element stride
 *   buffer    not referenced
 *
 * Always returns 0.
 *
 * The arithmetic lives in DGEMV_T_MSA; this function only picks which x
 * loader macros that expansion uses, based on whether inc_x is 1.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    /* Every local here is referenced by name from DGEMV_T_MSA and the
       macros it expands, so the identifiers must not be renamed. */
    BLASLONG i, j, k;
    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
    v2f64 x0, x1, x2, x3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v2f64 zero = {0};

    /* start of x, restored for every group of pa pointers */
    FLOAT *srcx_org = x;

    /* eight pointers into A, each lda elements after the previous one */
    FLOAT *pa0 = A;
    FLOAT *pa1 = pa0 + lda;
    FLOAT *pa2 = pa1 + lda;
    FLOAT *pa3 = pa2 + lda;
    FLOAT *pa4 = pa3 + lda;
    FLOAT *pa5 = pa4 + lda;
    FLOAT *pa6 = pa5 + lda;
    FLOAT *pa7 = pa6 + lda;

    if (1 != inc_x)
    {
        /* strided x: gather elements one at a time */
        #define DLOAD_X8 DLOAD_X8_GP
        #define DLOAD_X4 DLOAD_X4_GP
        #define DLOAD_X2 DLOAD_X2_GP
        DGEMV_T_MSA();
        #undef DLOAD_X8
        #undef DLOAD_X4
        #undef DLOAD_X2
    }
    else
    {
        /* inc_x == 1: use the vector loaders */
        #define DLOAD_X8 DLOAD_X8_VECTOR
        #define DLOAD_X4 DLOAD_X4_VECTOR
        #define DLOAD_X2 DLOAD_X2_VECTOR
        DGEMV_T_MSA();
        #undef DLOAD_X8
        #undef DLOAD_X4
        #undef DLOAD_X2
    }

    return 0;
}

Some files were not shown because too many files have changed in this diff Show More