conflict resolved by syncing with 'xianyi:develop'
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
This commit is contained in:
commit
085cf236c2
|
@ -151,5 +151,9 @@ In chronological order:
|
||||||
* [2016-03-20] Fix compiler error in VisualStudio with CMake
|
* [2016-03-20] Fix compiler error in VisualStudio with CMake
|
||||||
* [2016-03-22] Fix access violation on Windows while static linking
|
* [2016-03-22] Fix access violation on Windows while static linking
|
||||||
|
|
||||||
|
* Paul Mustière <https://github.com/buffer51/>
|
||||||
|
* [2016-02-04] Fix Android build on ARMV7
|
||||||
|
* [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
|
||||||
|
|
||||||
* Shivraj Patil <https://github.com/sva-img/>
|
* Shivraj Patil <https://github.com/sva-img/>
|
||||||
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
|
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -108,8 +108,6 @@ endif
|
||||||
|
|
||||||
tests :
|
tests :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
ifndef TARGET
|
|
||||||
ifndef CROSS
|
|
||||||
touch $(LIBNAME)
|
touch $(LIBNAME)
|
||||||
ifndef NO_FBLAS
|
ifndef NO_FBLAS
|
||||||
$(MAKE) -C test all
|
$(MAKE) -C test all
|
||||||
|
@ -119,8 +117,6 @@ ifndef NO_CBLAS
|
||||||
$(MAKE) -C ctest all
|
$(MAKE) -C ctest all
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
libs :
|
libs :
|
||||||
ifeq ($(CORE), UNKOWN)
|
ifeq ($(CORE), UNKOWN)
|
||||||
|
|
|
@ -20,75 +20,75 @@ lib.grd :
|
||||||
$(error OpenBLAS: Please run "make" firstly)
|
$(error OpenBLAS: Please run "make" firstly)
|
||||||
|
|
||||||
install : lib.grd
|
install : lib.grd
|
||||||
@-mkdir -p $(DESTDIR)$(PREFIX)
|
@-mkdir -p "$(DESTDIR)$(PREFIX)"
|
||||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
|
||||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
|
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||||
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
|
||||||
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
#for inc
|
#for inc
|
||||||
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||||
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||||
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||||
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||||
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
|
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
|
||||||
|
|
||||||
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
@echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||||
@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
@echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||||
@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
@echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||||
@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
@cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||||
@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
|
@echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
|
||||||
|
|
||||||
ifndef NO_CBLAS
|
ifndef NO_CBLAS
|
||||||
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
|
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef NO_LAPACKE
|
ifndef NO_LAPACKE
|
||||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||||
endif
|
endif
|
||||||
|
|
||||||
#for install static library
|
#for install static library
|
||||||
ifndef NO_STATIC
|
ifndef NO_STATIC
|
||||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
endif
|
endif
|
||||||
#for install shared library
|
#for install shared library
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||||
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifeq ($(OSNAME), FreeBSD)
|
||||||
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), NetBSD)
|
ifeq ($(OSNAME), NetBSD)
|
||||||
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
|
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||||
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
|
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||||
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
|
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
|
||||||
|
@ -96,34 +96,34 @@ endif
|
||||||
endif
|
endif
|
||||||
#Generating OpenBLASConfig.cmake
|
#Generating OpenBLASConfig.cmake
|
||||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
|
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
#ifeq logical or
|
#ifeq logical or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
#only static
|
#only static
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
endif
|
endif
|
||||||
#Generating OpenBLASConfigVersion.cmake
|
#Generating OpenBLASConfigVersion.cmake
|
||||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||||
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
|
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||||
@echo Install OK!
|
@echo Install OK!
|
||||||
|
|
||||||
|
|
|
@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt
|
||||||
- **MinGW or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
- **MinGW or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
|
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
|
||||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||||
|
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||||
|
|
||||||
## Usages
|
## Usages
|
||||||
Link with libopenblas.a or -lopenblas for shared library.
|
Link with libopenblas.a or -lopenblas for shared library.
|
||||||
|
|
18
c_check
18
c_check
|
@ -1,5 +1,7 @@
|
||||||
#!/usr/bin/perl
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
use File::Basename;
|
||||||
|
|
||||||
# Checking cross compile
|
# Checking cross compile
|
||||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||||
|
@ -26,14 +28,12 @@ if ($?) {
|
||||||
|
|
||||||
$cross_suffix = "";
|
$cross_suffix = "";
|
||||||
|
|
||||||
if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
|
if (dirname($compiler_name) ne ".") {
|
||||||
if ($1 =~ /(.*-)(.*)/) {
|
$cross_suffix .= dirname($compiler_name) . "/";
|
||||||
$cross_suffix = $1;
|
}
|
||||||
}
|
|
||||||
} else {
|
if (basename($compiler_name) =~ /(.*-)(.*)/) {
|
||||||
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
|
$cross_suffix .= $1;
|
||||||
$cross_suffix = $1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$compiler = "";
|
$compiler = "";
|
||||||
|
@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
|
||||||
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
|
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
|
||||||
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
|
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
|
||||||
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
|
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
|
||||||
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
|
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
|
||||||
print MAKEFILE "CROSS=1\n" if $cross != 0;
|
print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
|
||||||
all :: all1 all2 all3
|
all :: all1 all2 all3
|
||||||
|
|
||||||
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
|
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
|
||||||
|
ifndef CROSS
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
OMP_NUM_THREADS=2 ./xscblat1
|
OMP_NUM_THREADS=2 ./xscblat1
|
||||||
OMP_NUM_THREADS=2 ./xdcblat1
|
OMP_NUM_THREADS=2 ./xdcblat1
|
||||||
|
@ -53,8 +54,10 @@ else
|
||||||
OPENBLAS_NUM_THREADS=2 ./xccblat1
|
OPENBLAS_NUM_THREADS=2 ./xccblat1
|
||||||
OPENBLAS_NUM_THREADS=2 ./xzcblat1
|
OPENBLAS_NUM_THREADS=2 ./xzcblat1
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
|
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
|
||||||
|
ifndef CROSS
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
OMP_NUM_THREADS=2 ./xscblat2 < sin2
|
OMP_NUM_THREADS=2 ./xscblat2 < sin2
|
||||||
OMP_NUM_THREADS=2 ./xdcblat2 < din2
|
OMP_NUM_THREADS=2 ./xdcblat2 < din2
|
||||||
|
@ -66,8 +69,10 @@ else
|
||||||
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
|
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
|
||||||
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
|
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
|
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
|
||||||
|
ifndef CROSS
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
OMP_NUM_THREADS=2 ./xscblat3 < sin3
|
OMP_NUM_THREADS=2 ./xscblat3 < sin3
|
||||||
OMP_NUM_THREADS=2 ./xdcblat3 < din3
|
OMP_NUM_THREADS=2 ./xdcblat3 < din3
|
||||||
|
@ -88,6 +93,7 @@ else
|
||||||
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
||||||
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||||
char message[128];
|
char message[128];
|
||||||
//char mname[20];
|
//char mname[20];
|
||||||
|
|
||||||
for ( i=1 ; i <= 21; i++)
|
for ( i=1 ; i <= 22; i++)
|
||||||
{
|
{
|
||||||
if (!strncasecmp(coretype,corename[i],20))
|
if (!strncasecmp(coretype,corename[i],20))
|
||||||
{
|
{
|
||||||
|
|
|
@ -361,6 +361,9 @@ static void numa_mapping(void) {
|
||||||
unsigned long work, bit;
|
unsigned long work, bit;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
int bitmask_idx = 0;
|
int bitmask_idx = 0;
|
||||||
|
int current_cpu;
|
||||||
|
int current_node = 0;
|
||||||
|
int cpu_count = 0;
|
||||||
|
|
||||||
for (node = 0; node < common -> num_nodes; node ++) {
|
for (node = 0; node < common -> num_nodes; node ++) {
|
||||||
core = 0;
|
core = 0;
|
||||||
|
@ -382,6 +385,56 @@ static void numa_mapping(void) {
|
||||||
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
|
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
current_cpu = sched_getcpu();
|
||||||
|
for (cpu = 0; cpu < count; cpu++) {
|
||||||
|
if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
|
||||||
|
current_node = READ_NODE(common -> cpu_info[cpu]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (i = 0; i < MAX_BITMASK_LEN; i++)
|
||||||
|
cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If all the processes can be accommodated in the
|
||||||
|
* current node itself, then bind to cores
|
||||||
|
* from the current node only
|
||||||
|
*/
|
||||||
|
if (numprocs <= cpu_count) {
|
||||||
|
/*
|
||||||
|
* First sort all the cores in order from the current node.
|
||||||
|
* Then take remaining nodes one by one in order,
|
||||||
|
* and sort their cores in order.
|
||||||
|
*/
|
||||||
|
for (i = 0; i < count; i++) {
|
||||||
|
for (j = 0; j < count - 1; j++) {
|
||||||
|
int node_1, node_2;
|
||||||
|
int core_1, core_2;
|
||||||
|
int swap = 0;
|
||||||
|
|
||||||
|
node_1 = READ_NODE(common -> cpu_info[j]);
|
||||||
|
node_2 = READ_NODE(common -> cpu_info[j + 1]);
|
||||||
|
core_1 = READ_CORE(common -> cpu_info[j]);
|
||||||
|
core_2 = READ_CORE(common -> cpu_info[j + 1]);
|
||||||
|
|
||||||
|
if (node_1 == node_2) {
|
||||||
|
if (core_1 > core_2)
|
||||||
|
swap = 1;
|
||||||
|
} else {
|
||||||
|
if ((node_2 == current_node) ||
|
||||||
|
((node_1 != current_node) && (node_1 > node_2)))
|
||||||
|
swap = 1;
|
||||||
|
}
|
||||||
|
if (swap) {
|
||||||
|
unsigned long temp;
|
||||||
|
|
||||||
|
temp = common->cpu_info[j];
|
||||||
|
common->cpu_info[j] = common->cpu_info[j + 1];
|
||||||
|
common->cpu_info[j + 1] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
h = 1;
|
h = 1;
|
||||||
|
|
||||||
while (h < count) h = 2 * h + 1;
|
while (h < count) h = 2 * h + 1;
|
||||||
|
@ -411,12 +464,16 @@ static void numa_mapping(void) {
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "\nSorting ...\n\n");
|
fprintf(stderr, "\nSorting ...\n\n");
|
||||||
|
|
||||||
for (cpu = 0; cpu < count; cpu++)
|
for (cpu = 0; cpu < count; cpu++)
|
||||||
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
|
fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
|
||||||
|
READ_CPU(common -> cpu_info[cpu]),
|
||||||
|
READ_CORE(common -> cpu_info[cpu]),
|
||||||
|
READ_NODE(common -> cpu_info[cpu]));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -167,7 +167,7 @@ int get_L2_size(void){
|
||||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
|
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
|
|
||||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||||
|
|
||||||
|
@ -251,7 +251,7 @@ int get_L2_size(void){
|
||||||
void blas_set_parameter(void){
|
void blas_set_parameter(void){
|
||||||
|
|
||||||
int factor;
|
int factor;
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
int size = 16;
|
int size = 16;
|
||||||
#else
|
#else
|
||||||
int size = get_L2_size();
|
int size = get_L2_size();
|
||||||
|
|
|
@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||||
endif
|
endif
|
||||||
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
|
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
|
||||||
#only build without Fortran
|
#only build without Fortran
|
||||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
else
|
else
|
||||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
dllinit.$(SUFFIX) : dllinit.c
|
dllinit.$(SUFFIX) : dllinit.c
|
||||||
|
|
|
@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||||
|
@ -21,8 +21,8 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||||
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
|
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||||
DGEMMONCOPY = gemm_ncopy_4.S
|
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||||
DGEMMOTCOPY = gemm_tcopy_4.S
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||||
|
@ -30,7 +30,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
CGEMMITCOPY = cgemm_tcopy_8_power8.S
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||||
|
@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||||
|
|
|
@ -0,0 +1,206 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
/* Register aliases used by the routine below and by the included macro and logic files. */
/* M, N, A, LDA and B are read by the routine before it writes them (its inputs). */
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define A r5
|
||||||
|
#define LDA r6
|
||||||
|
#define B r7
|
||||||
|
|
||||||
|
/* A0..A3: per-row source pointers; the logic file sets them to A, A+LDA, A+2*LDA, A+3*LDA. */
#define A0 r8
|
||||||
|
#define A1 r9
|
||||||
|
#define A2 r10
|
||||||
|
#define A3 r11
|
||||||
|
|
||||||
|
/* J: inner loop counter, loaded with N >> 3 in the logic file. */
#define J r12
|
||||||
|
|
||||||
|
/* PREA/PREB: offsets for dcbt/dcbtst (also borrowed as AND masks during setup). */
#define PREA r14
|
||||||
|
#define PREB r15
|
||||||
|
/* BO: destination pointer handed to the COPY_* macros. */
#define BO r16
|
||||||
|
/* B8/B4/B2/B1: destination cursors for the 8-wide chunks and the N&4, N&2, N&1 remainders. */
#define B8 r17
|
||||||
|
#define B4 r18
|
||||||
|
#define B2 r19
|
||||||
|
#define B1 r20
|
||||||
|
/* o4/o16/o32/o48: registers loaded with the constants 4, 16, 32, 48 and used as index operands. */
#define o4 r21
|
||||||
|
/* T2: holds the -8 mask during setup. */
#define T2 r22
|
||||||
|
/* I: outer loop counter, loaded with M >> 2 in the logic file. */
#define I r23
|
||||||
|
#define o16 r24
|
||||||
|
#define o32 r25
|
||||||
|
#define o48 r26
|
||||||
|
/* NOTUS2: not referenced in the code visible here. */
#define NOTUS2 r27
|
||||||
|
/* M8: M << (3 + ZBASE_SHIFT), added to BO after each 8-wide copy. */
#define M8 r30
|
||||||
|
/* T1: scratch register (store cursor inside the COPY_* macros, andi. result in the logic file). */
#define T1 r31
|
||||||
|
|
||||||
|
/* o0 is the literal 0, not a register alias. */
#define o0 0
|
||||||
|
|
||||||
|
/* Pulls in the COPY_RxC macro definitions; included after the aliases because the macros use them. */
#include "cgemm_tcopy_macros_8_power8.S"
|
||||||
|
|
||||||
|
/* Size in bytes of the stack frame reserved by the routine. */
#define STACKSIZE 384
|
||||||
|
|
||||||
|
|
||||||
|
/* Routine body: reserves a STACKSIZE-byte frame, saves r14-r31, derives the
 * destination cursors and strides from M, N and B, runs the included copy
 * logic, then restores the registers and returns 0 in r3. */
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
/* Reserve the stack frame. */
addi SP, SP, -STACKSIZE
|
||||||
|
/* r0 is not referenced again in the code visible here. */
li r0, 0
|
||||||
|
|
||||||
|
/* Save r31 down to r14 at SP+144 .. SP+280 (8 bytes each). */
std r31, 144(SP)
|
||||||
|
std r30, 152(SP)
|
||||||
|
std r29, 160(SP)
|
||||||
|
std r28, 168(SP)
|
||||||
|
std r27, 176(SP)
|
||||||
|
std r26, 184(SP)
|
||||||
|
std r25, 192(SP)
|
||||||
|
std r24, 200(SP)
|
||||||
|
std r23, 208(SP)
|
||||||
|
std r22, 216(SP)
|
||||||
|
std r21, 224(SP)
|
||||||
|
std r20, 232(SP)
|
||||||
|
std r19, 240(SP)
|
||||||
|
std r18, 248(SP)
|
||||||
|
std r17, 256(SP)
|
||||||
|
std r16, 264(SP)
|
||||||
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
/* Nothing to copy when M <= 0 or N <= 0: go straight to the exit path. */
cmpwi cr0, M, 0
|
||||||
|
ble- L999
|
||||||
|
cmpwi cr0, N, 0
|
||||||
|
ble- L999
|
||||||
|
|
||||||
|
/* LDA <<= ZBASE_SHIFT (presumably elements -> bytes; ZBASE_SHIFT is defined outside this chunk). */
slwi LDA, LDA, ZBASE_SHIFT
|
||||||
|
/* M8 = M << (3 + ZBASE_SHIFT); the logic file adds it to BO after every 8-wide copy. */
slwi M8, M, 3 + ZBASE_SHIFT
|
||||||
|
|
||||||
|
/* -8, -4 and -2 are AND masks clearing the low 3, 2 and 1 bits of N.
 * PREA and PREB are only borrowed as temporaries here; they are reloaded below. */
li T2, -8
|
||||||
|
li PREA, -4
|
||||||
|
li PREB, -2
|
||||||
|
|
||||||
|
and B4, N, T2
|
||||||
|
and B2, N, PREA
|
||||||
|
and B1, N, PREB
|
||||||
|
|
||||||
|
/* Scale each masked N by M ... */
mullw B4, B4, M
|
||||||
|
mullw B2, B2, M
|
||||||
|
mullw B1, B1, M
|
||||||
|
|
||||||
|
/* ... shift by ZBASE_SHIFT ...
 * NOTE(review): slwi is a 32-bit shift, so these offsets appear limited to 32 bits - confirm for the supported sizes. */
slwi B4, B4, ZBASE_SHIFT
|
||||||
|
slwi B2, B2, ZBASE_SHIFT
|
||||||
|
slwi B1, B1, ZBASE_SHIFT
|
||||||
|
|
||||||
|
/* ... and add B: B4/B2/B1 = B + (((N & mask) * M) << ZBASE_SHIFT), the
 * destination cursors the logic file uses for the N&4, N&2 and N&1 remainders. */
add B4, B4, B
|
||||||
|
add B2, B2, B
|
||||||
|
add B1, B1, B
|
||||||
|
|
||||||
|
/* Offsets used with dcbt (PREA) and dcbtst (PREB) in the 4-row 8-wide loop. */
li PREA, 384
|
||||||
|
addi PREB, M8, 128
|
||||||
|
|
||||||
|
/* Constant index registers for the indexed loads/stores in the COPY_* macros. */
li o4, 4
|
||||||
|
li o16, 16
|
||||||
|
li o32, 32
|
||||||
|
li o48, 48
|
||||||
|
|
||||||
|
/* The copy loops are kept in a separate file and expanded inline here. */
#include "cgemm_tcopy_logic_8_power8.S"
|
||||||
|
|
||||||
|
/* Common exit; also reached directly when M <= 0 or N <= 0. */
L999:
|
||||||
|
|
||||||
|
/* Return value 0 in r3. */
li r3, 0
|
||||||
|
|
||||||
|
/* Restore r31 down to r14 from the slots written on entry. */
ld r31, 144(SP)
|
||||||
|
ld r30, 152(SP)
|
||||||
|
ld r29, 160(SP)
|
||||||
|
ld r28, 168(SP)
|
||||||
|
ld r27, 176(SP)
|
||||||
|
ld r26, 184(SP)
|
||||||
|
ld r25, 192(SP)
|
||||||
|
ld r24, 200(SP)
|
||||||
|
ld r23, 208(SP)
|
||||||
|
ld r22, 216(SP)
|
||||||
|
ld r21, 224(SP)
|
||||||
|
ld r20, 232(SP)
|
||||||
|
ld r19, 240(SP)
|
||||||
|
ld r18, 248(SP)
|
||||||
|
ld r17, 256(SP)
|
||||||
|
ld r16, 264(SP)
|
||||||
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
|
|
||||||
|
/* Release the stack frame. */
addi SP, SP, STACKSIZE
|
||||||
|
|
||||||
|
blr
|
||||||
|
EPILOGUE
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,247 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/* 4-row pass. I = M >> 2 groups of four source rows (A0..A3, LDA apart).
 * For each group the N >> 3 eight-element chunks are written through BO
 * starting at B8 (BO advances by M8 per chunk), and the N&4, N&2 and N&1
 * remainders are written at the B4, B2 and B1 cursors. */
srawi. I, M, 2
|
||||||
|
/* No full group of four rows: continue with the 2-row pass. */
ble CCOPYT_L2_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
CCOPYT_L4_BEGIN:
|
||||||
|
|
||||||
|
/* Set up the four row pointers and advance A past them. */
mr A0, A
|
||||||
|
add A1, A0, LDA
|
||||||
|
add A2, A1, LDA
|
||||||
|
add A3, A2, LDA
|
||||||
|
add A, A3, LDA
|
||||||
|
/* B8 keeps this group's destination start; B moves on by 64*SIZE for the next group. */
mr B8, B
|
||||||
|
addi B, B, 64*SIZE
|
||||||
|
|
||||||
|
/* J = number of 8-element chunks per row. */
sradi. J, N, 3
|
||||||
|
ble CCOPYT_L4x4_BEGIN
|
||||||
|
|
||||||
|
mr BO, B8
|
||||||
|
|
||||||
|
/* The loop body is unrolled twice; the prefetch hints are issued once per pair of COPY_4x8. */
CCOPYT_L4x8_LOOP:
|
||||||
|
|
||||||
|
dcbt A0, PREA
|
||||||
|
dcbt A1, PREA
|
||||||
|
dcbt A2, PREA
|
||||||
|
dcbt A3, PREA
|
||||||
|
dcbtst BO, M8
|
||||||
|
dcbtst BO, PREB
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
/* Leave between the two unrolled copies when the chunk count is odd. */
addic. J, J, -1
|
||||||
|
ble CCOPYT_L4x4_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt CCOPYT_L4x8_LOOP
|
||||||
|
|
||||||
|
CCOPYT_L4x4_BEGIN:
|
||||||
|
|
||||||
|
/* andi. never produces a negative result, so the ble below is taken exactly when the tested bit of N is clear. */
andi. T1, N, 4
|
||||||
|
ble CCOPYT_L4x2_BEGIN
|
||||||
|
|
||||||
|
mr BO, B4
|
||||||
|
|
||||||
|
COPY_4x4
|
||||||
|
|
||||||
|
|
||||||
|
/* Advance the 4-wide cursor past the block just written. */
addi B4, B4, 32*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L4x2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 2
|
||||||
|
ble CCOPYT_L4x1_BEGIN
|
||||||
|
|
||||||
|
mr BO, B2
|
||||||
|
|
||||||
|
COPY_4x2
|
||||||
|
|
||||||
|
|
||||||
|
addi B2, B2, 16*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L4x1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 1
|
||||||
|
ble CCOPYT_L4_END
|
||||||
|
|
||||||
|
mr BO, B1
|
||||||
|
|
||||||
|
COPY_4x1
|
||||||
|
|
||||||
|
|
||||||
|
addi B1, B1, 8*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L4_END:
|
||||||
|
|
||||||
|
/* Next group of four rows, if any remain. */
addic. I, I, -1
|
||||||
|
bgt CCOPYT_L4_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* 2-row pass, executed once when bit 1 of M is set. Same structure as the
 * 4-row pass with two row pointers (A0, A1), no prefetch hints and no unrolling. */
CCOPYT_L2_BEGIN:
|
||||||
|
|
||||||
|
/* andi. never produces a negative result, so ble is taken exactly when the tested bit is clear. */
andi. T1, M, 2
|
||||||
|
ble CCOPYT_L1_BEGIN
|
||||||
|
|
||||||
|
mr A0, A
|
||||||
|
add A1, A0, LDA
|
||||||
|
add A, A1, LDA
|
||||||
|
/* B8 keeps this pass's destination start; B moves on by 32*SIZE. */
mr B8, B
|
||||||
|
addi B, B, 32*SIZE
|
||||||
|
|
||||||
|
/* J = number of 8-element chunks per row. */
sradi. J, N, 3
|
||||||
|
ble CCOPYT_L2x4_BEGIN
|
||||||
|
|
||||||
|
mr BO, B8
|
||||||
|
|
||||||
|
CCOPYT_L2x8_LOOP:
|
||||||
|
|
||||||
|
COPY_2x8
|
||||||
|
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt CCOPYT_L2x8_LOOP
|
||||||
|
|
||||||
|
CCOPYT_L2x4_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 4
|
||||||
|
ble CCOPYT_L2x2_BEGIN
|
||||||
|
|
||||||
|
mr BO, B4
|
||||||
|
|
||||||
|
COPY_2x4
|
||||||
|
|
||||||
|
|
||||||
|
addi B4, B4, 16*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L2x2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 2
|
||||||
|
ble CCOPYT_L2x1_BEGIN
|
||||||
|
|
||||||
|
mr BO, B2
|
||||||
|
|
||||||
|
COPY_2x2
|
||||||
|
|
||||||
|
|
||||||
|
addi B2, B2, 8*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L2x1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 1
|
||||||
|
ble CCOPYT_L2_END
|
||||||
|
|
||||||
|
mr BO, B1
|
||||||
|
|
||||||
|
COPY_2x1
|
||||||
|
|
||||||
|
|
||||||
|
addi B1, B1, 4*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L2_END:
|
||||||
|
|
||||||
|
|
||||||
|
/* 1-row pass, executed once when bit 0 of M is set. Same structure as the
 * 2-row pass with a single row pointer (A0). */
CCOPYT_L1_BEGIN:
|
||||||
|
|
||||||
|
/* andi. never produces a negative result, so ble is taken exactly when the tested bit is clear. */
andi. T1, M, 1
|
||||||
|
/* No odd row left: branch to L999, which is defined by the file that includes this one. */
ble L999
|
||||||
|
|
||||||
|
mr A0, A
|
||||||
|
add A, A0, LDA
|
||||||
|
/* B8 keeps this pass's destination start; B moves on by 16*SIZE. */
mr B8, B
|
||||||
|
addi B, B, 16*SIZE
|
||||||
|
|
||||||
|
/* J = number of 8-element chunks in the row. */
sradi. J, N, 3
|
||||||
|
ble CCOPYT_L1x4_BEGIN
|
||||||
|
|
||||||
|
mr BO, B8
|
||||||
|
|
||||||
|
CCOPYT_L1x8_LOOP:
|
||||||
|
|
||||||
|
COPY_1x8
|
||||||
|
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt CCOPYT_L1x8_LOOP
|
||||||
|
|
||||||
|
CCOPYT_L1x4_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 4
|
||||||
|
ble CCOPYT_L1x2_BEGIN
|
||||||
|
|
||||||
|
mr BO, B4
|
||||||
|
|
||||||
|
COPY_1x4
|
||||||
|
|
||||||
|
|
||||||
|
addi B4, B4, 8*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L1x2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 2
|
||||||
|
ble CCOPYT_L1x1_BEGIN
|
||||||
|
|
||||||
|
mr BO, B2
|
||||||
|
|
||||||
|
COPY_1x2
|
||||||
|
|
||||||
|
|
||||||
|
addi B2, B2, 4*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L1x1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 1
|
||||||
|
ble CCOPYT_L1_END
|
||||||
|
|
||||||
|
mr BO, B1
|
||||||
|
|
||||||
|
COPY_1x1
|
||||||
|
|
||||||
|
|
||||||
|
addi B1, B1, 2*SIZE
|
||||||
|
|
||||||
|
CCOPYT_L1_END:
|
||||||
|
|
|
@ -0,0 +1,385 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x8
/* Loads 64 bytes from each of A0, A1, A2, A3 (four 16-byte vector loads per
 * pointer), advances each pointer by 64, and stores the 256 bytes
 * contiguously at BO in A0, A1, A2, A3 order. T1 is the store cursor and is
 * clobbered; BO is left unchanged. Uses vs32-vs47. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
lxvw4x vs34, o32, A0
|
||||||
|
lxvw4x vs35, o48, A0
|
||||||
|
|
||||||
|
lxvw4x vs36, o0, A1
|
||||||
|
lxvw4x vs37, o16, A1
|
||||||
|
lxvw4x vs38, o32, A1
|
||||||
|
lxvw4x vs39, o48, A1
|
||||||
|
|
||||||
|
addi A0, A0, 64
|
||||||
|
addi A1, A1, 64
|
||||||
|
|
||||||
|
lxvw4x vs40, o0, A2
|
||||||
|
lxvw4x vs41, o16, A2
|
||||||
|
lxvw4x vs42, o32, A2
|
||||||
|
lxvw4x vs43, o48, A2
|
||||||
|
|
||||||
|
lxvw4x vs44, o0, A3
|
||||||
|
lxvw4x vs45, o16, A3
|
||||||
|
lxvw4x vs46, o32, A3
|
||||||
|
lxvw4x vs47, o48, A3
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
addi A2, A2, 64
|
||||||
|
addi A3, A3, 64
|
||||||
|
|
||||||
|
/* Data loaded from A0 -> BO+0 .. BO+63 */
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
addi T1, T1, 64
|
||||||
|
|
||||||
|
/* Data loaded from A1 -> BO+64 .. BO+127 */
stxvw4x vs36, o0, T1
|
||||||
|
stxvw4x vs37, o16, T1
|
||||||
|
stxvw4x vs38, o32, T1
|
||||||
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
addi T1, T1, 64
|
||||||
|
|
||||||
|
/* Data loaded from A2 -> BO+128 .. BO+191 */
stxvw4x vs40, o0, T1
|
||||||
|
stxvw4x vs41, o16, T1
|
||||||
|
stxvw4x vs42, o32, T1
|
||||||
|
stxvw4x vs43, o48, T1
|
||||||
|
|
||||||
|
addi T1, T1, 64
|
||||||
|
|
||||||
|
/* Data loaded from A3 -> BO+192 .. BO+255 */
stxvw4x vs44, o0, T1
|
||||||
|
stxvw4x vs45, o16, T1
|
||||||
|
stxvw4x vs46, o32, T1
|
||||||
|
stxvw4x vs47, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x4
/* Loads 32 bytes from each of A0, A1, A2, A3 (two 16-byte vector loads per
 * pointer), advances each pointer by 32, and stores the 128 bytes
 * contiguously at BO in A0, A1, A2, A3 order. T1 is clobbered; BO is left
 * unchanged. Uses vs32-vs39. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
addi A0, A0, 32
|
||||||
|
|
||||||
|
lxvw4x vs34, o0, A1
|
||||||
|
lxvw4x vs35, o16, A1
|
||||||
|
addi A1, A1, 32
|
||||||
|
|
||||||
|
lxvw4x vs36, o0, A2
|
||||||
|
lxvw4x vs37, o16, A2
|
||||||
|
addi A2, A2, 32
|
||||||
|
|
||||||
|
lxvw4x vs38, o0, A3
|
||||||
|
lxvw4x vs39, o16, A3
|
||||||
|
addi A3, A3, 32
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
/* Data loaded from A0 and A1 -> BO+0 .. BO+63 */
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
addi T1, T1, 64
|
||||||
|
|
||||||
|
/* Data loaded from A2 and A3 -> BO+64 .. BO+127 */
stxvw4x vs36, o0, T1
|
||||||
|
stxvw4x vs37, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs38, o32, T1
|
||||||
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x2
/* Loads one 16-byte vector from each of A0, A1, A2, A3, advances each
 * pointer by 16, and stores the four vectors at BO+0, +16, +32, +48 in that
 * order. T1 is set to BO; BO is left unchanged. Uses vs32-vs35. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
addi A0, A0, 16
|
||||||
|
|
||||||
|
lxvw4x vs33, o0, A1
|
||||||
|
addi A1, A1, 16
|
||||||
|
|
||||||
|
lxvw4x vs34, o0, A2
|
||||||
|
addi A2, A2, 16
|
||||||
|
|
||||||
|
lxvw4x vs35, o0, A3
|
||||||
|
addi A3, A3, 16
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x1
/* Loads two single-precision scalars (offsets 0 and o4) from each of A0, A1,
 * A2, A3, advances each pointer by 8, and stores the four pairs
 * consecutively at BO (8 bytes per source pointer). T1 is the store cursor
 * and is clobbered; BO is left unchanged. Uses vs32-vs39. */
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
lxsspx vs33, o4, A0
|
||||||
|
addi A0, A0, 8
|
||||||
|
|
||||||
|
lxsspx vs34, o0, A1
|
||||||
|
lxsspx vs35, o4, A1
|
||||||
|
addi A1, A1, 8
|
||||||
|
|
||||||
|
lxsspx vs36, o0, A2
|
||||||
|
lxsspx vs37, o4, A2
|
||||||
|
addi A2, A2, 8
|
||||||
|
|
||||||
|
lxsspx vs38, o0, A3
|
||||||
|
lxsspx vs39, o4, A3
|
||||||
|
addi A3, A3, 8
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
/* Pair from A0 */
stxsspx vs32, o0, T1
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8
|
||||||
|
|
||||||
|
/* Pair from A1 */
stxsspx vs34, o0, T1
|
||||||
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8
|
||||||
|
|
||||||
|
/* Pair from A2 */
stxsspx vs36, o0, T1
|
||||||
|
stxsspx vs37, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8
|
||||||
|
|
||||||
|
/* Pair from A3 */
stxsspx vs38, o0, T1
|
||||||
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x8
/* Loads 64 bytes from each of A0 and A1 (four 16-byte vector loads per
 * pointer), advances each pointer by 64, and stores the 128 bytes
 * contiguously at BO, A0 data first. T1 is clobbered; BO is left unchanged.
 * Uses vs32-vs39. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
lxvw4x vs34, o32, A0
|
||||||
|
lxvw4x vs35, o48, A0
|
||||||
|
addi A0, A0, 64
|
||||||
|
|
||||||
|
lxvw4x vs36, o0, A1
|
||||||
|
lxvw4x vs37, o16, A1
|
||||||
|
lxvw4x vs38, o32, A1
|
||||||
|
lxvw4x vs39, o48, A1
|
||||||
|
addi A1, A1, 64
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
/* Data loaded from A0 -> BO+0 .. BO+63 */
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
addi T1, T1, 64
|
||||||
|
|
||||||
|
/* Data loaded from A1 -> BO+64 .. BO+127 */
stxvw4x vs36, o0, T1
|
||||||
|
stxvw4x vs37, o16, T1
|
||||||
|
stxvw4x vs38, o32, T1
|
||||||
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x4
/* Loads 32 bytes from each of A0 and A1 (two 16-byte vector loads per
 * pointer), advances each pointer by 32, and stores the 64 bytes
 * contiguously at BO, A0 data first. T1 is set to BO; BO is left unchanged.
 * Uses vs32-vs35. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
addi A0, A0, 32
|
||||||
|
|
||||||
|
lxvw4x vs34, o0, A1
|
||||||
|
lxvw4x vs35, o16, A1
|
||||||
|
addi A1, A1, 32
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x2
/* Loads one 16-byte vector from each of A0 and A1, advances each pointer by
 * 16, and stores the two vectors at BO+0 and BO+16. T1 is set to BO; BO is
 * left unchanged. Uses vs32-vs33. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
addi A0, A0, 16
|
||||||
|
|
||||||
|
lxvw4x vs33, o0, A1
|
||||||
|
addi A1, A1, 16
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x1
/* Loads two single-precision scalars (offsets 0 and o4) from each of A0 and
 * A1, advances each pointer by 8, and stores the two pairs consecutively at
 * BO (A0 pair at +0, A1 pair at +8). T1 is clobbered; BO is left unchanged.
 * Uses vs32-vs35. */
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
lxsspx vs33, o4, A0
|
||||||
|
addi A0, A0, 8
|
||||||
|
|
||||||
|
lxsspx vs34, o0, A1
|
||||||
|
lxsspx vs35, o4, A1
|
||||||
|
addi A1, A1, 8
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8
|
||||||
|
|
||||||
|
stxsspx vs34, o0, T1
|
||||||
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x8
/* Loads 64 bytes from A0 (four 16-byte vector loads), advances A0 by 64,
 * and stores the 64 bytes at BO. T1 is set to BO; BO is left unchanged.
 * Uses vs32-vs35. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
lxvw4x vs34, o32, A0
|
||||||
|
lxvw4x vs35, o48, A0
|
||||||
|
addi A0, A0, 64
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x4
/* Loads 32 bytes from A0 (two 16-byte vector loads), advances A0 by 32, and
 * stores the 32 bytes at BO. T1 is set to BO; BO is left unchanged.
 * Uses vs32-vs33. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
addi A0, A0, 32
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x2
/* Loads one 16-byte vector from A0, advances A0 by 16, and stores the vector
 * at BO. T1 is set to BO; BO is left unchanged. Uses vs32. */
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
addi A0, A0, 16
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x1
/* Loads two single-precision scalars from A0 (offsets 0 and o4), advances A0
 * by 8, and stores the pair at BO. T1 is set to BO; BO is left unchanged.
 * Uses vs32-vs33. */
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
lxsspx vs33, o4, A0
|
||||||
|
addi A0, A0, 8
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
|
@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define o0 0
|
#define o0 0
|
||||||
|
|
||||||
|
#define T4 r12
|
||||||
|
#define T3 r11
|
||||||
|
|
||||||
|
#define o40 r12
|
||||||
|
#define o56 r11
|
||||||
|
|
||||||
|
#define o112 r14
|
||||||
#define o8 r15
|
#define o8 r15
|
||||||
#define o24 r16
|
#define o24 r16
|
||||||
#define ALPHA r17
|
#define o64 r17
|
||||||
#define L r18
|
#define L r18
|
||||||
#define T1 r19
|
#define T1 r19
|
||||||
#define KK r20
|
#define o80 r20
|
||||||
#define BB r21
|
#define o96 r21
|
||||||
#define I r22
|
#define I r22
|
||||||
#define J r23
|
#define J r23
|
||||||
#define AO r24
|
#define AO r24
|
||||||
|
@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
std r17, 256(SP)
|
std r17, 256(SP)
|
||||||
std r16, 264(SP)
|
std r16, 264(SP)
|
||||||
std r15, 272(SP)
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
#else
|
#else
|
||||||
stw r31, 144(SP)
|
stw r31, 144(SP)
|
||||||
stw r30, 148(SP)
|
stw r30, 148(SP)
|
||||||
|
@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stw r17, 200(SP)
|
stw r17, 200(SP)
|
||||||
stw r16, 204(SP)
|
stw r16, 204(SP)
|
||||||
stw r15, 208(SP)
|
stw r15, 208(SP)
|
||||||
|
stw r14, 212(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
stfd f1, ALPHA_SP
|
stfd f1, ALPHA_SP
|
||||||
|
@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ble .L999_H1
|
ble .L999_H1
|
||||||
|
|
||||||
#ifdef __64BIT__
|
#ifdef __64BIT__
|
||||||
addi ALPHA, SP, 296
|
addi T1, SP, 296
|
||||||
#else
|
#else
|
||||||
addi ALPHA, SP, 224
|
addi T1, SP, 224
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
li PRE, 256
|
li PRE, 384
|
||||||
li o8 , 8
|
li o8 , 8
|
||||||
li o16, 16
|
li o16, 16
|
||||||
li o24, 24
|
li o24, 24
|
||||||
li o32, 32
|
li o32, 32
|
||||||
li o48, 48
|
li o48, 48
|
||||||
|
li o64, 64
|
||||||
|
li o80, 80
|
||||||
|
li o96, 96
|
||||||
|
li o112, 112
|
||||||
|
|
||||||
lxvdsx alpha_r, 0, ALPHA
|
lxvdsx alpha_r, 0, T1
|
||||||
|
|
||||||
#include "dgemm_logic_16x4_power8.S"
|
#include "dgemm_logic_16x4_power8.S"
|
||||||
|
|
||||||
|
@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ld r17, 256(SP)
|
ld r17, 256(SP)
|
||||||
ld r16, 264(SP)
|
ld r16, 264(SP)
|
||||||
ld r15, 272(SP)
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
#else
|
#else
|
||||||
lwz r31, 144(SP)
|
lwz r31, 144(SP)
|
||||||
lwz r30, 148(SP)
|
lwz r30, 148(SP)
|
||||||
|
@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
lwz r17, 200(SP)
|
lwz r17, 200(SP)
|
||||||
lwz r16, 204(SP)
|
lwz r16, 204(SP)
|
||||||
lwz r15, 208(SP)
|
lwz r15, 208(SP)
|
||||||
|
lwz r14, 212(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -47,21 +47,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
lxvdsx vs24, 0, BO
|
lxvdsx vs24, 0, BO
|
||||||
lxvdsx vs25, o8, BO
|
lxvdsx vs25, o8, BO
|
||||||
|
|
||||||
addi AO, AO, 64
|
lxvd2x vs4, o64, AO
|
||||||
|
lxvd2x vs5, o80, AO
|
||||||
lxvd2x vs4, 0, AO
|
lxvd2x vs6, o96, AO
|
||||||
lxvd2x vs5, o16, AO
|
lxvd2x vs7, o112, AO
|
||||||
lxvd2x vs6, o32, AO
|
|
||||||
lxvd2x vs7, o48, AO
|
|
||||||
|
|
||||||
lxvdsx vs26, o16, BO
|
lxvdsx vs26, o16, BO
|
||||||
lxvdsx vs27, o24, BO
|
lxvdsx vs27, o24, BO
|
||||||
|
|
||||||
addi AO, AO, 64
|
addi AO, AO, 128
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
.macro KERNEL4x16_I1
|
.macro KERNEL4x16_I1
|
||||||
|
|
||||||
xvmuldp vs32, vs0, vs24
|
xvmuldp vs32, vs0, vs24
|
||||||
|
@ -69,8 +68,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmuldp vs34, vs2, vs24
|
xvmuldp vs34, vs2, vs24
|
||||||
xvmuldp vs35, vs3, vs24
|
xvmuldp vs35, vs3, vs24
|
||||||
|
|
||||||
lxvd2x vs8, 0, AO
|
lxvd2x vs8, o0, AO
|
||||||
lxvd2x vs9, o16, AO
|
lxvd2x vs9, o16, AO
|
||||||
|
lxvd2x vs10, o32, AO
|
||||||
|
lxvd2x vs11, o48, AO
|
||||||
|
|
||||||
xvmuldp vs36, vs4, vs24
|
xvmuldp vs36, vs4, vs24
|
||||||
xvmuldp vs37, vs5, vs24
|
xvmuldp vs37, vs5, vs24
|
||||||
|
@ -85,37 +86,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmuldp vs42, vs2, vs25
|
xvmuldp vs42, vs2, vs25
|
||||||
xvmuldp vs43, vs3, vs25
|
xvmuldp vs43, vs3, vs25
|
||||||
|
|
||||||
lxvd2x vs10, o32, AO
|
|
||||||
lxvd2x vs11, o48, AO
|
|
||||||
|
|
||||||
xvmuldp vs44, vs4, vs25
|
xvmuldp vs44, vs4, vs25
|
||||||
xvmuldp vs45, vs5, vs25
|
xvmuldp vs45, vs5, vs25
|
||||||
xvmuldp vs46, vs6, vs25
|
xvmuldp vs46, vs6, vs25
|
||||||
xvmuldp vs47, vs7, vs25
|
xvmuldp vs47, vs7, vs25
|
||||||
|
|
||||||
addi AO, AO, 64
|
|
||||||
|
|
||||||
xvmuldp vs48, vs0, vs26
|
xvmuldp vs48, vs0, vs26
|
||||||
xvmuldp vs49, vs1, vs26
|
xvmuldp vs49, vs1, vs26
|
||||||
xvmuldp vs50, vs2, vs26
|
xvmuldp vs50, vs2, vs26
|
||||||
xvmuldp vs51, vs3, vs26
|
xvmuldp vs51, vs3, vs26
|
||||||
|
|
||||||
lxvd2x vs12, 0, AO
|
lxvd2x vs12, o64, AO
|
||||||
lxvd2x vs13, o16, AO
|
lxvd2x vs13, o80, AO
|
||||||
|
|
||||||
xvmuldp vs52, vs4, vs26
|
xvmuldp vs52, vs4, vs26
|
||||||
xvmuldp vs53, vs5, vs26
|
xvmuldp vs53, vs5, vs26
|
||||||
xvmuldp vs54, vs6, vs26
|
xvmuldp vs54, vs6, vs26
|
||||||
xvmuldp vs55, vs7, vs26
|
xvmuldp vs55, vs7, vs26
|
||||||
|
|
||||||
lxvd2x vs14, o32, AO
|
lxvd2x vs14, o96, AO
|
||||||
lxvd2x vs15, o48, AO
|
lxvd2x vs15, o112, AO
|
||||||
|
|
||||||
xvmuldp vs56, vs0, vs27
|
xvmuldp vs56, vs0, vs27
|
||||||
xvmuldp vs57, vs1, vs27
|
xvmuldp vs57, vs1, vs27
|
||||||
xvmuldp vs58, vs2, vs27
|
xvmuldp vs58, vs2, vs27
|
||||||
xvmuldp vs59, vs3, vs27
|
xvmuldp vs59, vs3, vs27
|
||||||
|
|
||||||
|
|
||||||
lxvdsx vs30, o16, BO
|
lxvdsx vs30, o16, BO
|
||||||
lxvdsx vs31, o24, BO
|
lxvdsx vs31, o24, BO
|
||||||
|
|
||||||
|
@ -124,11 +123,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmuldp vs62, vs6, vs27
|
xvmuldp vs62, vs6, vs27
|
||||||
xvmuldp vs63, vs7, vs27
|
xvmuldp vs63, vs7, vs27
|
||||||
|
|
||||||
addi AO, AO, 64
|
addi AO, AO, 128
|
||||||
addi BO, BO, 32
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.macro KERNEL4x16_1
|
.macro KERNEL4x16_1
|
||||||
|
|
||||||
xvmaddadp vs32, vs0, vs24
|
xvmaddadp vs32, vs0, vs24
|
||||||
|
@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs34, vs2, vs24
|
xvmaddadp vs34, vs2, vs24
|
||||||
xvmaddadp vs35, vs3, vs24
|
xvmaddadp vs35, vs3, vs24
|
||||||
|
|
||||||
lxvd2x vs8, 0, AO
|
lxvd2x vs8, o0, AO
|
||||||
lxvd2x vs9, o16, AO
|
lxvd2x vs9, o16, AO
|
||||||
|
lxvd2x vs10, o32, AO
|
||||||
|
lxvd2x vs11, o48, AO
|
||||||
|
|
||||||
xvmaddadp vs36, vs4, vs24
|
xvmaddadp vs36, vs4, vs24
|
||||||
xvmaddadp vs37, vs5, vs24
|
xvmaddadp vs37, vs5, vs24
|
||||||
|
@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs42, vs2, vs25
|
xvmaddadp vs42, vs2, vs25
|
||||||
xvmaddadp vs43, vs3, vs25
|
xvmaddadp vs43, vs3, vs25
|
||||||
|
|
||||||
lxvd2x vs10, o32, AO
|
|
||||||
lxvd2x vs11, o48, AO
|
|
||||||
|
|
||||||
xvmaddadp vs44, vs4, vs25
|
xvmaddadp vs44, vs4, vs25
|
||||||
xvmaddadp vs45, vs5, vs25
|
xvmaddadp vs45, vs5, vs25
|
||||||
xvmaddadp vs46, vs6, vs25
|
xvmaddadp vs46, vs6, vs25
|
||||||
xvmaddadp vs47, vs7, vs25
|
xvmaddadp vs47, vs7, vs25
|
||||||
|
|
||||||
addi AO, AO, 64
|
|
||||||
|
|
||||||
xvmaddadp vs48, vs0, vs26
|
xvmaddadp vs48, vs0, vs26
|
||||||
xvmaddadp vs49, vs1, vs26
|
xvmaddadp vs49, vs1, vs26
|
||||||
xvmaddadp vs50, vs2, vs26
|
xvmaddadp vs50, vs2, vs26
|
||||||
xvmaddadp vs51, vs3, vs26
|
xvmaddadp vs51, vs3, vs26
|
||||||
|
|
||||||
lxvd2x vs12, 0, AO
|
lxvd2x vs12, o64, AO
|
||||||
lxvd2x vs13, o16, AO
|
lxvd2x vs13, o80, AO
|
||||||
|
|
||||||
xvmaddadp vs52, vs4, vs26
|
xvmaddadp vs52, vs4, vs26
|
||||||
xvmaddadp vs53, vs5, vs26
|
xvmaddadp vs53, vs5, vs26
|
||||||
xvmaddadp vs54, vs6, vs26
|
xvmaddadp vs54, vs6, vs26
|
||||||
xvmaddadp vs55, vs7, vs26
|
xvmaddadp vs55, vs7, vs26
|
||||||
|
|
||||||
lxvd2x vs14, o32, AO
|
lxvd2x vs14, o96, AO
|
||||||
lxvd2x vs15, o48, AO
|
lxvd2x vs15, o112, AO
|
||||||
|
|
||||||
xvmaddadp vs56, vs0, vs27
|
xvmaddadp vs56, vs0, vs27
|
||||||
xvmaddadp vs57, vs1, vs27
|
xvmaddadp vs57, vs1, vs27
|
||||||
|
@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs62, vs6, vs27
|
xvmaddadp vs62, vs6, vs27
|
||||||
xvmaddadp vs63, vs7, vs27
|
xvmaddadp vs63, vs7, vs27
|
||||||
|
|
||||||
addi AO, AO, 64
|
addi AO, AO, 128
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs46, vs14, vs29
|
xvmaddadp vs46, vs14, vs29
|
||||||
xvmaddadp vs47, vs15, vs29
|
xvmaddadp vs47, vs15, vs29
|
||||||
|
|
||||||
addi AO, AO, 64
|
|
||||||
|
|
||||||
xvmaddadp vs48, vs8, vs30
|
xvmaddadp vs48, vs8, vs30
|
||||||
xvmaddadp vs49, vs9, vs30
|
xvmaddadp vs49, vs9, vs30
|
||||||
xvmaddadp vs50, vs10, vs30
|
xvmaddadp vs50, vs10, vs30
|
||||||
xvmaddadp vs51, vs11, vs30
|
xvmaddadp vs51, vs11, vs30
|
||||||
|
|
||||||
lxvd2x vs4, 0, AO
|
lxvd2x vs4, o64, AO
|
||||||
lxvd2x vs5, o16, AO
|
lxvd2x vs5, o80, AO
|
||||||
|
|
||||||
xvmaddadp vs52, vs12, vs30
|
xvmaddadp vs52, vs12, vs30
|
||||||
xvmaddadp vs53, vs13, vs30
|
xvmaddadp vs53, vs13, vs30
|
||||||
xvmaddadp vs54, vs14, vs30
|
xvmaddadp vs54, vs14, vs30
|
||||||
xvmaddadp vs55, vs15, vs30
|
xvmaddadp vs55, vs15, vs30
|
||||||
|
|
||||||
lxvd2x vs6, o32, AO
|
lxvd2x vs6, o96, AO
|
||||||
lxvd2x vs7, o48, AO
|
lxvd2x vs7, o112, AO
|
||||||
|
|
||||||
xvmaddadp vs56, vs8, vs31
|
xvmaddadp vs56, vs8, vs31
|
||||||
xvmaddadp vs57, vs9, vs31
|
xvmaddadp vs57, vs9, vs31
|
||||||
|
@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs62, vs14, vs31
|
xvmaddadp vs62, vs14, vs31
|
||||||
xvmaddadp vs63, vs15, vs31
|
xvmaddadp vs63, vs15, vs31
|
||||||
|
|
||||||
addi AO, AO, 64
|
addi AO, AO, 128
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL4x16_L1
/*
 * One step of the 4x16 inner loop: accumulates vs32-vs63 += vs0..vs7 * vs24..vs27
 * (xvmaddadp), interleaved with loads of vs8-vs15 from AO and vs28-vs31 from BO.
 * Those freshly loaded registers are the operands multiplied by KERNEL4x16_L2.
 * AO is advanced by 128 bytes at the end; BO is not modified in this macro.
 */
|
||||||
|
|
||||||
|
xvmaddadp vs32, vs0, vs24
|
||||||
|
xvmaddadp vs33, vs1, vs24
|
||||||
|
xvmaddadp vs34, vs2, vs24
|
||||||
|
xvmaddadp vs35, vs3, vs24
|
||||||
|
|
||||||
|
/* Load A for the following step: 16 bytes per register at AO+0..AO+48. */
lxvd2x vs8, o0, AO
|
||||||
|
lxvd2x vs9, o16, AO
|
||||||
|
lxvd2x vs10, o32, AO
|
||||||
|
lxvd2x vs11, o48, AO
|
||||||
|
|
||||||
|
xvmaddadp vs36, vs4, vs24
|
||||||
|
xvmaddadp vs37, vs5, vs24
|
||||||
|
xvmaddadp vs38, vs6, vs24
|
||||||
|
xvmaddadp vs39, vs7, vs24
|
||||||
|
|
||||||
|
/* Load-and-splat the B doublewords at BO+0 and BO+8 for the following step. */
lxvdsx vs28, 0, BO
|
||||||
|
lxvdsx vs29, o8, BO
|
||||||
|
|
||||||
|
xvmaddadp vs40, vs0, vs25
|
||||||
|
xvmaddadp vs41, vs1, vs25
|
||||||
|
xvmaddadp vs42, vs2, vs25
|
||||||
|
xvmaddadp vs43, vs3, vs25
|
||||||
|
|
||||||
|
|
||||||
|
xvmaddadp vs44, vs4, vs25
|
||||||
|
xvmaddadp vs45, vs5, vs25
|
||||||
|
xvmaddadp vs46, vs6, vs25
|
||||||
|
xvmaddadp vs47, vs7, vs25
|
||||||
|
|
||||||
|
|
||||||
|
xvmaddadp vs48, vs0, vs26
|
||||||
|
xvmaddadp vs49, vs1, vs26
|
||||||
|
xvmaddadp vs50, vs2, vs26
|
||||||
|
xvmaddadp vs51, vs3, vs26
|
||||||
|
|
||||||
|
/* Remaining A loads for the following step, from AO+64..AO+112. */
lxvd2x vs12, o64, AO
|
||||||
|
lxvd2x vs13, o80, AO
|
||||||
|
|
||||||
|
xvmaddadp vs52, vs4, vs26
|
||||||
|
xvmaddadp vs53, vs5, vs26
|
||||||
|
xvmaddadp vs54, vs6, vs26
|
||||||
|
xvmaddadp vs55, vs7, vs26
|
||||||
|
|
||||||
|
lxvd2x vs14, o96, AO
|
||||||
|
lxvd2x vs15, o112, AO
|
||||||
|
|
||||||
|
xvmaddadp vs56, vs0, vs27
|
||||||
|
xvmaddadp vs57, vs1, vs27
|
||||||
|
xvmaddadp vs58, vs2, vs27
|
||||||
|
xvmaddadp vs59, vs3, vs27
|
||||||
|
|
||||||
|
|
||||||
|
/* Load-and-splat the B doublewords at BO+16 and BO+24 for the following step. */
lxvdsx vs30, o16, BO
|
||||||
|
lxvdsx vs31, o24, BO
|
||||||
|
|
||||||
|
xvmaddadp vs60, vs4, vs27
|
||||||
|
xvmaddadp vs61, vs5, vs27
|
||||||
|
xvmaddadp vs62, vs6, vs27
|
||||||
|
xvmaddadp vs63, vs7, vs27
|
||||||
|
|
||||||
|
/* Step AO past the 128 bytes just loaded. */
addi AO, AO, 128
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL4x16_L2
/*
 * Counterpart of KERNEL4x16_L1: accumulates vs32-vs63 += vs8..vs15 * vs28..vs31
 * (xvmaddadp), interleaved with reloading vs0-vs7 from AO and vs24-vs27 from
 * BO+32..BO+56. Near the end AO is advanced by 128 bytes and BO by 64 bytes;
 * the two addi instructions are placed between the final multiply-adds.
 */
|
||||||
|
|
||||||
|
xvmaddadp vs32, vs8, vs28
|
||||||
|
xvmaddadp vs33, vs9, vs28
|
||||||
|
xvmaddadp vs34, vs10, vs28
|
||||||
|
xvmaddadp vs35, vs11, vs28
|
||||||
|
|
||||||
|
/* Reload A into vs0..vs7, 16 bytes per register, starting at AO+0. */
lxvd2x vs0, 0, AO
|
||||||
|
lxvd2x vs1, o16, AO
|
||||||
|
|
||||||
|
xvmaddadp vs36, vs12, vs28
|
||||||
|
xvmaddadp vs37, vs13, vs28
|
||||||
|
xvmaddadp vs38, vs14, vs28
|
||||||
|
xvmaddadp vs39, vs15, vs28
|
||||||
|
|
||||||
|
/* Load-and-splat the B doublewords at BO+32 and BO+40. */
lxvdsx vs24, o32, BO
|
||||||
|
lxvdsx vs25, o40, BO
|
||||||
|
|
||||||
|
xvmaddadp vs40, vs8, vs29
|
||||||
|
xvmaddadp vs41, vs9, vs29
|
||||||
|
xvmaddadp vs42, vs10, vs29
|
||||||
|
xvmaddadp vs43, vs11, vs29
|
||||||
|
|
||||||
|
lxvd2x vs2, o32, AO
|
||||||
|
lxvd2x vs3, o48, AO
|
||||||
|
|
||||||
|
xvmaddadp vs44, vs12, vs29
|
||||||
|
xvmaddadp vs45, vs13, vs29
|
||||||
|
xvmaddadp vs46, vs14, vs29
|
||||||
|
xvmaddadp vs47, vs15, vs29
|
||||||
|
|
||||||
|
|
||||||
|
xvmaddadp vs48, vs8, vs30
|
||||||
|
xvmaddadp vs49, vs9, vs30
|
||||||
|
xvmaddadp vs50, vs10, vs30
|
||||||
|
xvmaddadp vs51, vs11, vs30
|
||||||
|
|
||||||
|
lxvd2x vs4, o64, AO
|
||||||
|
lxvd2x vs5, o80, AO
|
||||||
|
|
||||||
|
xvmaddadp vs52, vs12, vs30
|
||||||
|
xvmaddadp vs53, vs13, vs30
|
||||||
|
xvmaddadp vs54, vs14, vs30
|
||||||
|
xvmaddadp vs55, vs15, vs30
|
||||||
|
|
||||||
|
lxvd2x vs6, o96, AO
|
||||||
|
lxvd2x vs7, o112, AO
|
||||||
|
|
||||||
|
xvmaddadp vs56, vs8, vs31
|
||||||
|
xvmaddadp vs57, vs9, vs31
|
||||||
|
xvmaddadp vs58, vs10, vs31
|
||||||
|
xvmaddadp vs59, vs11, vs31
|
||||||
|
|
||||||
|
/* Load-and-splat the B doublewords at BO+48 and BO+56. */
lxvdsx vs26, o48, BO
|
||||||
|
lxvdsx vs27, o56, BO
|
||||||
|
|
||||||
|
xvmaddadp vs60, vs12, vs31
|
||||||
|
/* All AO-relative loads of this macro are above; step AO past those 128 bytes. */
addi AO, AO, 128
|
||||||
|
xvmaddadp vs61, vs13, vs31
|
||||||
|
xvmaddadp vs62, vs14, vs31
|
||||||
|
/* All BO-relative loads of this macro are above; step BO by 64 bytes. */
addi BO, BO, 64
|
||||||
|
xvmaddadp vs63, vs15, vs31
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
.macro KERNEL4x16_E2
|
.macro KERNEL4x16_E2
|
||||||
|
|
||||||
|
|
||||||
|
@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
lxvdsx vs26, o16, BO
|
lxvdsx vs26, o16, BO
|
||||||
lxvdsx vs27, o24, BO
|
lxvdsx vs27, o24, BO
|
||||||
|
|
||||||
addi AO, AO, 64
|
|
||||||
addi BO, BO, 32
|
|
||||||
|
|
||||||
lxvd2x vs4, 0, AO
|
lxvd2x vs4, o64, AO
|
||||||
lxvd2x vs5, o16, AO
|
lxvd2x vs5, o80, AO
|
||||||
lxvd2x vs6, o32, AO
|
lxvd2x vs6, o96, AO
|
||||||
lxvd2x vs7, o48, AO
|
lxvd2x vs7, o112, AO
|
||||||
|
|
||||||
addi AO, AO, 64
|
|
||||||
|
|
||||||
|
|
||||||
xvmaddadp vs32, vs0, vs24
|
xvmaddadp vs32, vs0, vs24
|
||||||
|
@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs41, vs1, vs25
|
xvmaddadp vs41, vs1, vs25
|
||||||
xvmaddadp vs42, vs2, vs25
|
xvmaddadp vs42, vs2, vs25
|
||||||
xvmaddadp vs43, vs3, vs25
|
xvmaddadp vs43, vs3, vs25
|
||||||
|
addi BO, BO, 32
|
||||||
xvmaddadp vs44, vs4, vs25
|
xvmaddadp vs44, vs4, vs25
|
||||||
xvmaddadp vs45, vs5, vs25
|
xvmaddadp vs45, vs5, vs25
|
||||||
xvmaddadp vs46, vs6, vs25
|
xvmaddadp vs46, vs6, vs25
|
||||||
|
@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs49, vs1, vs26
|
xvmaddadp vs49, vs1, vs26
|
||||||
xvmaddadp vs50, vs2, vs26
|
xvmaddadp vs50, vs2, vs26
|
||||||
xvmaddadp vs51, vs3, vs26
|
xvmaddadp vs51, vs3, vs26
|
||||||
|
addi AO, AO, 128
|
||||||
xvmaddadp vs52, vs4, vs26
|
xvmaddadp vs52, vs4, vs26
|
||||||
xvmaddadp vs53, vs5, vs26
|
xvmaddadp vs53, vs5, vs26
|
||||||
xvmaddadp vs54, vs6, vs26
|
xvmaddadp vs54, vs6, vs26
|
||||||
|
@ -430,21 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro SAVE4x16
|
.macro SAVE4x16
|
||||||
|
|
||||||
mr T1, CO
|
mr T1, CO
|
||||||
addi T2, T1, 64
|
add T2, T1, LDC
|
||||||
|
add T3, T2, LDC
|
||||||
|
add T4, T3, LDC
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
lxvd2x vs0, 0, CO
|
||||||
lxvd2x vs0, 0, T1
|
lxvd2x vs1, o16, CO
|
||||||
lxvd2x vs1, o16, T1
|
lxvd2x vs2, o32, CO
|
||||||
lxvd2x vs2, o32, T1
|
lxvd2x vs3, o48, CO
|
||||||
lxvd2x vs3, o48, T1
|
lxvd2x vs4, o64, CO
|
||||||
|
lxvd2x vs5, o80, CO
|
||||||
|
lxvd2x vs6, o96, CO
|
||||||
|
lxvd2x vs7, o112, CO
|
||||||
|
|
||||||
lxvd2x vs4, 0, T2
|
lxvd2x vs8, 0, T2
|
||||||
lxvd2x vs5, o16, T2
|
lxvd2x vs9, o16, T2
|
||||||
lxvd2x vs6, o32, T2
|
lxvd2x vs10, o32, T2
|
||||||
lxvd2x vs7, o48, T2
|
lxvd2x vs11, o48, T2
|
||||||
#endif
|
lxvd2x vs12, o64, T2
|
||||||
|
lxvd2x vs13, o80, T2
|
||||||
|
lxvd2x vs14, o96, T2
|
||||||
|
lxvd2x vs15, o112, T2
|
||||||
|
|
||||||
|
lxvd2x vs24, 0, T3
|
||||||
|
lxvd2x vs25, o16, T3
|
||||||
|
lxvd2x vs26, o32, T3
|
||||||
|
lxvd2x vs27, o48, T3
|
||||||
|
lxvd2x vs28, o64, T3
|
||||||
|
lxvd2x vs29, o80, T3
|
||||||
|
lxvd2x vs30, o96, T3
|
||||||
|
lxvd2x vs31, o112, T3
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
|
||||||
xvmaddadp vs0, vs32, alpha_r
|
xvmaddadp vs0, vs32, alpha_r
|
||||||
xvmaddadp vs1, vs33, alpha_r
|
xvmaddadp vs1, vs33, alpha_r
|
||||||
xvmaddadp vs2, vs34, alpha_r
|
xvmaddadp vs2, vs34, alpha_r
|
||||||
|
@ -453,172 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xvmaddadp vs5, vs37, alpha_r
|
xvmaddadp vs5, vs37, alpha_r
|
||||||
xvmaddadp vs6, vs38, alpha_r
|
xvmaddadp vs6, vs38, alpha_r
|
||||||
xvmaddadp vs7, vs39, alpha_r
|
xvmaddadp vs7, vs39, alpha_r
|
||||||
#else
|
|
||||||
xvmuldp vs0, vs32, alpha_r
|
|
||||||
xvmuldp vs1, vs33, alpha_r
|
|
||||||
xvmuldp vs2, vs34, alpha_r
|
|
||||||
xvmuldp vs3, vs35, alpha_r
|
|
||||||
xvmuldp vs4, vs36, alpha_r
|
|
||||||
xvmuldp vs5, vs37, alpha_r
|
|
||||||
xvmuldp vs6, vs38, alpha_r
|
|
||||||
xvmuldp vs7, vs39, alpha_r
|
|
||||||
#endif
|
|
||||||
|
|
||||||
stxvd2x vs0, 0, T1
|
lxvd2x vs32, 0, T4
|
||||||
stxvd2x vs1, o16, T1
|
lxvd2x vs33, o16, T4
|
||||||
stxvd2x vs2, o32, T1
|
lxvd2x vs34, o32, T4
|
||||||
stxvd2x vs3, o48, T1
|
lxvd2x vs35, o48, T4
|
||||||
|
lxvd2x vs36, o64, T4
|
||||||
|
lxvd2x vs37, o80, T4
|
||||||
|
lxvd2x vs38, o96, T4
|
||||||
|
lxvd2x vs39, o112, T4
|
||||||
|
|
||||||
dcbt T1, PRE
|
|
||||||
|
|
||||||
stxvd2x vs4, 0, T2
|
|
||||||
stxvd2x vs5, o16, T2
|
|
||||||
stxvd2x vs6, o32, T2
|
|
||||||
stxvd2x vs7, o48, T2
|
|
||||||
|
|
||||||
add T1, T1, LDC
|
|
||||||
add T2, T2, LDC
|
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
|
||||||
lxvd2x vs8, 0, T1
|
|
||||||
lxvd2x vs9, o16, T1
|
|
||||||
lxvd2x vs10, o32, T1
|
|
||||||
lxvd2x vs11, o48, T1
|
|
||||||
|
|
||||||
lxvd2x vs12, 0, T2
|
|
||||||
lxvd2x vs13, o16, T2
|
|
||||||
lxvd2x vs14, o32, T2
|
|
||||||
lxvd2x vs15, o48, T2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
|
||||||
xvmaddadp vs8, vs40, alpha_r
|
xvmaddadp vs8, vs40, alpha_r
|
||||||
xvmaddadp vs9, vs41, alpha_r
|
xvmaddadp vs9, vs41, alpha_r
|
||||||
xvmaddadp vs10, vs42, alpha_r
|
xvmaddadp vs10, vs42, alpha_r
|
||||||
xvmaddadp vs11, vs43, alpha_r
|
xvmaddadp vs11, vs43, alpha_r
|
||||||
xvmaddadp vs12, vs44, alpha_r
|
|
||||||
xvmaddadp vs13, vs45, alpha_r
|
|
||||||
xvmaddadp vs14, vs46, alpha_r
|
|
||||||
xvmaddadp vs15, vs47, alpha_r
|
|
||||||
#else
|
|
||||||
xvmuldp vs8, vs40, alpha_r
|
|
||||||
xvmuldp vs9, vs41, alpha_r
|
|
||||||
xvmuldp vs10, vs42, alpha_r
|
|
||||||
xvmuldp vs11, vs43, alpha_r
|
|
||||||
xvmuldp vs12, vs44, alpha_r
|
|
||||||
xvmuldp vs13, vs45, alpha_r
|
|
||||||
xvmuldp vs14, vs46, alpha_r
|
|
||||||
xvmuldp vs15, vs47, alpha_r
|
|
||||||
#endif
|
|
||||||
|
|
||||||
stxvd2x vs8, 0, T1
|
|
||||||
stxvd2x vs9, o16, T1
|
|
||||||
stxvd2x vs10, o32, T1
|
|
||||||
stxvd2x vs11, o48, T1
|
|
||||||
|
|
||||||
dcbt T1, PRE
|
|
||||||
|
|
||||||
stxvd2x vs12, 0, T2
|
|
||||||
stxvd2x vs13, o16, T2
|
|
||||||
stxvd2x vs14, o32, T2
|
|
||||||
stxvd2x vs15, o48, T2
|
|
||||||
|
|
||||||
add T1, T1, LDC
|
|
||||||
add T2, T2, LDC
|
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
|
||||||
lxvd2x vs0, 0, T1
|
|
||||||
lxvd2x vs1, o16, T1
|
|
||||||
lxvd2x vs2, o32, T1
|
|
||||||
lxvd2x vs3, o48, T1
|
|
||||||
|
|
||||||
lxvd2x vs4, 0, T2
|
|
||||||
lxvd2x vs5, o16, T2
|
|
||||||
lxvd2x vs6, o32, T2
|
|
||||||
lxvd2x vs7, o48, T2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
|
||||||
xvmaddadp vs0, vs48, alpha_r
|
|
||||||
xvmaddadp vs1, vs49, alpha_r
|
|
||||||
xvmaddadp vs2, vs50, alpha_r
|
|
||||||
xvmaddadp vs3, vs51, alpha_r
|
|
||||||
xvmaddadp vs4, vs52, alpha_r
|
|
||||||
xvmaddadp vs5, vs53, alpha_r
|
|
||||||
xvmaddadp vs6, vs54, alpha_r
|
|
||||||
xvmaddadp vs7, vs55, alpha_r
|
|
||||||
#else
|
|
||||||
xvmuldp vs0, vs48, alpha_r
|
|
||||||
xvmuldp vs1, vs49, alpha_r
|
|
||||||
xvmuldp vs2, vs50, alpha_r
|
|
||||||
xvmuldp vs3, vs51, alpha_r
|
|
||||||
xvmuldp vs4, vs52, alpha_r
|
|
||||||
xvmuldp vs5, vs53, alpha_r
|
|
||||||
xvmuldp vs6, vs54, alpha_r
|
|
||||||
xvmuldp vs7, vs55, alpha_r
|
|
||||||
#endif
|
|
||||||
|
|
||||||
stxvd2x vs0, 0, T1
|
stxvd2x vs0, 0, T1
|
||||||
stxvd2x vs1, o16, T1
|
stxvd2x vs1, o16, T1
|
||||||
stxvd2x vs2, o32, T1
|
stxvd2x vs2, o32, T1
|
||||||
stxvd2x vs3, o48, T1
|
stxvd2x vs3, o48, T1
|
||||||
|
|
||||||
dcbt T1, PRE
|
xvmaddadp vs12, vs44, alpha_r
|
||||||
|
xvmaddadp vs13, vs45, alpha_r
|
||||||
|
xvmaddadp vs14, vs46, alpha_r
|
||||||
|
xvmaddadp vs15, vs47, alpha_r
|
||||||
|
|
||||||
stxvd2x vs4, 0, T2
|
stxvd2x vs4, o64, T1
|
||||||
stxvd2x vs5, o16, T2
|
stxvd2x vs5, o80, T1
|
||||||
stxvd2x vs6, o32, T2
|
stxvd2x vs6, o96, T1
|
||||||
stxvd2x vs7, o48, T2
|
stxvd2x vs7, o112, T1
|
||||||
|
|
||||||
add T1, T1, LDC
|
xvmaddadp vs24, vs48, alpha_r
|
||||||
add T2, T2, LDC
|
xvmaddadp vs25, vs49, alpha_r
|
||||||
|
xvmaddadp vs26, vs50, alpha_r
|
||||||
|
xvmaddadp vs27, vs51, alpha_r
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
stxvd2x vs8, o0, T2
|
||||||
lxvd2x vs8, 0, T1
|
stxvd2x vs9, o16, T2
|
||||||
lxvd2x vs9, o16, T1
|
stxvd2x vs10, o32, T2
|
||||||
lxvd2x vs10, o32, T1
|
stxvd2x vs11, o48, T2
|
||||||
lxvd2x vs11, o48, T1
|
|
||||||
|
|
||||||
lxvd2x vs12, 0, T2
|
xvmaddadp vs28, vs52, alpha_r
|
||||||
lxvd2x vs13, o16, T2
|
xvmaddadp vs29, vs53, alpha_r
|
||||||
lxvd2x vs14, o32, T2
|
xvmaddadp vs30, vs54, alpha_r
|
||||||
lxvd2x vs15, o48, T2
|
xvmaddadp vs31, vs55, alpha_r
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef TRMMKERNEL
|
stxvd2x vs12, o64, T2
|
||||||
xvmaddadp vs8, vs56, alpha_r
|
stxvd2x vs13, o80, T2
|
||||||
xvmaddadp vs9, vs57, alpha_r
|
stxvd2x vs14, o96, T2
|
||||||
xvmaddadp vs10, vs58, alpha_r
|
stxvd2x vs15, o112, T2
|
||||||
xvmaddadp vs11, vs59, alpha_r
|
|
||||||
xvmaddadp vs12, vs60, alpha_r
|
|
||||||
xvmaddadp vs13, vs61, alpha_r
|
|
||||||
xvmaddadp vs14, vs62, alpha_r
|
|
||||||
xvmaddadp vs15, vs63, alpha_r
|
|
||||||
#else
|
|
||||||
xvmuldp vs8, vs56, alpha_r
|
|
||||||
xvmuldp vs9, vs57, alpha_r
|
|
||||||
xvmuldp vs10, vs58, alpha_r
|
|
||||||
xvmuldp vs11, vs59, alpha_r
|
|
||||||
xvmuldp vs12, vs60, alpha_r
|
|
||||||
xvmuldp vs13, vs61, alpha_r
|
|
||||||
xvmuldp vs14, vs62, alpha_r
|
|
||||||
xvmuldp vs15, vs63, alpha_r
|
|
||||||
#endif
|
|
||||||
|
|
||||||
stxvd2x vs8, 0, T1
|
xvmaddadp vs32, vs56, alpha_r
|
||||||
stxvd2x vs9, o16, T1
|
xvmaddadp vs33, vs57, alpha_r
|
||||||
stxvd2x vs10, o32, T1
|
xvmaddadp vs34, vs58, alpha_r
|
||||||
stxvd2x vs11, o48, T1
|
xvmaddadp vs35, vs59, alpha_r
|
||||||
|
|
||||||
dcbt T1, PRE
|
stxvd2x vs24, 0, T3
|
||||||
|
stxvd2x vs25, o16, T3
|
||||||
|
stxvd2x vs26, o32, T3
|
||||||
|
stxvd2x vs27, o48, T3
|
||||||
|
|
||||||
stxvd2x vs12, 0, T2
|
xvmaddadp vs36, vs60, alpha_r
|
||||||
stxvd2x vs13, o16, T2
|
xvmaddadp vs37, vs61, alpha_r
|
||||||
stxvd2x vs14, o32, T2
|
xvmaddadp vs38, vs62, alpha_r
|
||||||
stxvd2x vs15, o48, T2
|
xvmaddadp vs39, vs63, alpha_r
|
||||||
|
|
||||||
|
stxvd2x vs28, o64, T3
|
||||||
|
stxvd2x vs29, o80, T3
|
||||||
|
stxvd2x vs30, o96, T3
|
||||||
|
stxvd2x vs31, o112, T3
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, T4
|
||||||
|
stxvd2x vs33, o16, T4
|
||||||
|
stxvd2x vs34, o32, T4
|
||||||
|
stxvd2x vs35, o48, T4
|
||||||
|
|
||||||
addi CO, CO, 128
|
addi CO, CO, 128
|
||||||
|
|
||||||
|
stxvd2x vs36, o64, T4
|
||||||
|
stxvd2x vs37, o80, T4
|
||||||
|
stxvd2x vs38, o96, T4
|
||||||
|
stxvd2x vs39, o112, T4
|
||||||
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/*********************************************************************
|
/*********************************************************************
|
||||||
|
|
|
@ -0,0 +1,228 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define A r5
|
||||||
|
#define LDA r6
|
||||||
|
#define B r7
|
||||||
|
|
||||||
|
#define A0 r8
|
||||||
|
#define A1 r9
|
||||||
|
#define A2 r10
|
||||||
|
#define A3 r11
|
||||||
|
|
||||||
|
#define J r12
|
||||||
|
|
||||||
|
#define PREA r14
|
||||||
|
#define PREB r15
|
||||||
|
#define BO r16
|
||||||
|
#define o64 r17
|
||||||
|
#define o80 r18
|
||||||
|
#define o96 r19
|
||||||
|
#define o112 r20
|
||||||
|
#define o8 r21
|
||||||
|
#define T2 r22
|
||||||
|
#define I r23
|
||||||
|
#define o16 r24
|
||||||
|
#define o32 r25
|
||||||
|
#define o48 r26
|
||||||
|
#define NOTU1 r27
|
||||||
|
#define NOTU2 r30
|
||||||
|
#define T1 r31
|
||||||
|
|
||||||
|
#define o0 0
|
||||||
|
|
||||||
|
#include "dgemm_ncopy_macros_4_power8.S"
|
||||||
|
|
||||||
|
#define STACKSIZE 384
|
||||||
|
|
||||||
|
|
||||||
|
PROLOGUE
/*
 * Entry point of the ncopy routine. Reserves STACKSIZE bytes of stack, saves
 * f14-f31 and r14-r31 there, returns early when M <= 0 or N <= 0, sets up the
 * constant offset registers, runs the copy logic included from
 * dgemm_ncopy_logic_4_power8.S, then restores the saved registers and returns
 * 0 in r3.
 */
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
/* Reserve the register save area. */
addi SP, SP, -STACKSIZE
|
||||||
|
li r0, 0
|
||||||
|
|
||||||
|
/* Save f14-f31 at SP+0 .. SP+136. */
stfd f14, 0(SP)
|
||||||
|
stfd f15, 8(SP)
|
||||||
|
stfd f16, 16(SP)
|
||||||
|
stfd f17, 24(SP)
|
||||||
|
stfd f18, 32(SP)
|
||||||
|
stfd f19, 40(SP)
|
||||||
|
stfd f20, 48(SP)
|
||||||
|
stfd f21, 56(SP)
|
||||||
|
stfd f22, 64(SP)
|
||||||
|
stfd f23, 72(SP)
|
||||||
|
stfd f24, 80(SP)
|
||||||
|
stfd f25, 88(SP)
|
||||||
|
stfd f26, 96(SP)
|
||||||
|
stfd f27, 104(SP)
|
||||||
|
stfd f28, 112(SP)
|
||||||
|
stfd f29, 120(SP)
|
||||||
|
stfd f30, 128(SP)
|
||||||
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
|
||||||
|
/* Save r14-r31 at SP+144 .. SP+280. */
std r31, 144(SP)
|
||||||
|
std r30, 152(SP)
|
||||||
|
std r29, 160(SP)
|
||||||
|
std r28, 168(SP)
|
||||||
|
std r27, 176(SP)
|
||||||
|
std r26, 184(SP)
|
||||||
|
std r25, 192(SP)
|
||||||
|
std r24, 200(SP)
|
||||||
|
std r23, 208(SP)
|
||||||
|
std r22, 216(SP)
|
||||||
|
std r21, 224(SP)
|
||||||
|
std r20, 232(SP)
|
||||||
|
std r19, 240(SP)
|
||||||
|
std r18, 248(SP)
|
||||||
|
std r17, 256(SP)
|
||||||
|
std r16, 264(SP)
|
||||||
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
/* Nothing to copy when M <= 0 or N <= 0: go straight to the exit path. */
cmpwi cr0, M, 0
|
||||||
|
ble- L999
|
||||||
|
cmpwi cr0, N, 0
|
||||||
|
ble- L999
|
||||||
|
|
||||||
|
/*
 * Scale LDA by 2^BASE_SHIFT (BASE_SHIFT is defined outside this file;
 * presumably this converts an element count to a byte stride - TODO confirm).
 * NOTE(review): slwi is a 32-bit shift, so a scaled stride that does not fit
 * in 32 bits would be truncated - confirm callers never pass such an LDA.
 */
slwi LDA, LDA, BASE_SHIFT
|
||||||
|
|
||||||
|
/* PREA is the dcbt offset used by the included copy loop; PREB's use is not visible here. */
li PREA, 384
|
||||||
|
li PREB, 384
|
||||||
|
|
||||||
|
/* Constant offsets kept in registers (presumably for the indexed loads/stores in the COPY_* macros). */
li o8, 8
|
||||||
|
li o16, 16
|
||||||
|
li o32, 32
|
||||||
|
li o48, 48
|
||||||
|
li o64, 64
|
||||||
|
li o80, 80
|
||||||
|
li o96, 96
|
||||||
|
li o112, 112
|
||||||
|
|
||||||
|
/* The copy loops themselves live in the included logic file. */
#include "dgemm_ncopy_logic_4_power8.S"
|
||||||
|
|
||||||
|
/* Common exit path, also the target of the early-out branches above. */
L999:
|
||||||
|
|
||||||
|
/* Return value: 0. */
li r3, 0
|
||||||
|
|
||||||
|
/* Restore f14-f31 from the same slots they were saved to. */
lfd f14, 0(SP)
|
||||||
|
lfd f15, 8(SP)
|
||||||
|
lfd f16, 16(SP)
|
||||||
|
lfd f17, 24(SP)
|
||||||
|
lfd f18, 32(SP)
|
||||||
|
lfd f19, 40(SP)
|
||||||
|
lfd f20, 48(SP)
|
||||||
|
lfd f21, 56(SP)
|
||||||
|
lfd f22, 64(SP)
|
||||||
|
lfd f23, 72(SP)
|
||||||
|
lfd f24, 80(SP)
|
||||||
|
lfd f25, 88(SP)
|
||||||
|
lfd f26, 96(SP)
|
||||||
|
lfd f27, 104(SP)
|
||||||
|
lfd f28, 112(SP)
|
||||||
|
lfd f29, 120(SP)
|
||||||
|
lfd f30, 128(SP)
|
||||||
|
lfd f31, 136(SP)
|
||||||
|
|
||||||
|
/* Restore r14-r31. */
ld r31, 144(SP)
|
||||||
|
ld r30, 152(SP)
|
||||||
|
ld r29, 160(SP)
|
||||||
|
ld r28, 168(SP)
|
||||||
|
ld r27, 176(SP)
|
||||||
|
ld r26, 184(SP)
|
||||||
|
ld r25, 192(SP)
|
||||||
|
ld r24, 200(SP)
|
||||||
|
ld r23, 208(SP)
|
||||||
|
ld r22, 216(SP)
|
||||||
|
ld r21, 224(SP)
|
||||||
|
ld r20, 232(SP)
|
||||||
|
ld r19, 240(SP)
|
||||||
|
ld r18, 248(SP)
|
||||||
|
ld r17, 256(SP)
|
||||||
|
ld r16, 264(SP)
|
||||||
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
|
|
||||||
|
/* Release the save area. */
addi SP, SP, STACKSIZE
|
||||||
|
|
||||||
|
blr
|
||||||
|
EPILOGUE
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,237 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Four-at-a-time pass. BO starts at B, and I = N >> 2 is the number of passes;
 * when that is zero the whole section is skipped. Each pass sets A0..A3 to four
 * consecutive LDA-strided pointers starting at A and advances A past them.
 * Within a pass, M is consumed as (M >> 4) COPY_4x16 steps, then one optional
 * COPY_4x8 / 4x4 / 4x2 / 4x1 step for each of bits 3..0 of M.
 */
mr BO, B
|
||||||
|
srawi. I, N, 2
|
||||||
|
ble DCOPYN_L2_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4_BEGIN:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4_LOOP:
|
||||||
|
|
||||||
|
/* A0..A3 are four pointers LDA bytes apart; A moves on to the next group. */
mr A0, A
|
||||||
|
add A1, A0, LDA
|
||||||
|
add A2, A1, LDA
|
||||||
|
add A3, A2, LDA
|
||||||
|
add A, A3, LDA
|
||||||
|
|
||||||
|
DCOPYN_L4x16_BEGIN:
|
||||||
|
|
||||||
|
/* J = M >> 4: number of COPY_4x16 steps. */
srawi. J, M, 4
|
||||||
|
ble DCOPYN_L4x16_END
|
||||||
|
|
||||||
|
DCOPYN_L4x16_LOOP:
|
||||||
|
|
||||||
|
/* Prefetch hints PREA bytes ahead of each of the four source pointers. */
dcbt A0, PREA
|
||||||
|
dcbt A1, PREA
|
||||||
|
dcbt A2, PREA
|
||||||
|
dcbt A3, PREA
|
||||||
|
COPY_4x16
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt DCOPYN_L4x16_LOOP
|
||||||
|
|
||||||
|
DCOPYN_L4x16_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4x8_BEGIN:
|
||||||
|
|
||||||
|
/* andi. sets CR0 from the masked value, so ble is taken only when the tested bit of M is clear. */
andi. J, M, 8
|
||||||
|
ble DCOPYN_L4x8_END
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
DCOPYN_L4x8_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4x4_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 4
|
||||||
|
ble DCOPYN_L4x4_END
|
||||||
|
COPY_4x4
|
||||||
|
|
||||||
|
DCOPYN_L4x4_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4x2_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 2
|
||||||
|
ble DCOPYN_L4x2_END
|
||||||
|
COPY_4x2
|
||||||
|
|
||||||
|
DCOPYN_L4x2_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4x1_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 1
|
||||||
|
ble DCOPYN_L4x1_END
|
||||||
|
COPY_4x1
|
||||||
|
|
||||||
|
DCOPYN_L4x1_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L4_END:
|
||||||
|
|
||||||
|
/* Next group of four while I > 0. */
addic. I, I, -1
|
||||||
|
bgt DCOPYN_L4_LOOP
|
||||||
|
|
||||||
|
DCOPYN_L2_BEGIN:
|
||||||
|
|
||||||
|
/*
 * Test bit 1 of N; the two-at-a-time section runs only when it is set.
 * The count register is named N, as in the srawi. at the top of this file,
 * instead of by its bare register number. andi. sets CR0 from the masked
 * value, so ble is taken only when that bit is clear.
 */
andi. T1, N, 2
|
||||||
|
ble DCOPYN_L2_END
|
||||||
|
|
||||||
|
DCOPYN_L2_LOOP:
/*
 * Two-at-a-time section: A0 and A1 are two pointers LDA bytes apart starting at
 * A, and A is advanced past them. M is consumed as (M >> 4) COPY_2x16 steps,
 * then one optional COPY_2x8 / 2x4 / 2x2 / 2x1 step for each of bits 3..0 of M.
 * No branch in this section jumps back to DCOPYN_L2_LOOP, so it runs at most
 * once per call.
 */
|
||||||
|
|
||||||
|
mr A0, A
|
||||||
|
add A1, A0, LDA
|
||||||
|
add A, A1, LDA
|
||||||
|
|
||||||
|
DCOPYN_L2x16_BEGIN:
|
||||||
|
|
||||||
|
/* J = M >> 4: number of COPY_2x16 steps. */
srawi. J, M, 4
|
||||||
|
ble DCOPYN_L2x16_END
|
||||||
|
|
||||||
|
DCOPYN_L2x16_LOOP:
|
||||||
|
|
||||||
|
COPY_2x16
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt DCOPYN_L2x16_LOOP
|
||||||
|
|
||||||
|
DCOPYN_L2x16_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L2x8_BEGIN:
|
||||||
|
|
||||||
|
/* andi. sets CR0 from the masked value, so ble is taken only when the tested bit of M is clear. */
andi. J, M, 8
|
||||||
|
ble DCOPYN_L2x8_END
|
||||||
|
COPY_2x8
|
||||||
|
|
||||||
|
DCOPYN_L2x8_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L2x4_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 4
|
||||||
|
ble DCOPYN_L2x4_END
|
||||||
|
COPY_2x4
|
||||||
|
|
||||||
|
DCOPYN_L2x4_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L2x2_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 2
|
||||||
|
ble DCOPYN_L2x2_END
|
||||||
|
COPY_2x2
|
||||||
|
|
||||||
|
DCOPYN_L2x2_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L2x1_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 1
|
||||||
|
ble DCOPYN_L2x1_END
|
||||||
|
COPY_2x1
|
||||||
|
|
||||||
|
DCOPYN_L2x1_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L2_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L1_BEGIN:
|
||||||
|
|
||||||
|
/*
 * Test bit 0 of N; the one-at-a-time section runs only when it is set.
 * The count register is named N, as in the srawi. at the top of this file,
 * instead of by its bare register number. andi. sets CR0 from the masked
 * value, so ble is taken only when that bit is clear.
 */
andi. T1, N, 1
|
||||||
|
ble DCOPYN_L1_END
|
||||||
|
|
||||||
|
DCOPYN_L1_LOOP:
/*
 * One-at-a-time section: A0 takes the current A, and A is advanced by LDA.
 * M is consumed as (M >> 4) COPY_1x16 steps, then one optional
 * COPY_1x8 / 1x4 / 1x2 / 1x1 step for each of bits 3..0 of M.
 * No branch in this section jumps back to DCOPYN_L1_LOOP, so it runs at most
 * once per call.
 */
|
||||||
|
|
||||||
|
mr A0, A
|
||||||
|
add A, A0, LDA
|
||||||
|
|
||||||
|
DCOPYN_L1x16_BEGIN:
|
||||||
|
|
||||||
|
/* J = M >> 4: number of COPY_1x16 steps. */
srawi. J, M, 4
|
||||||
|
ble DCOPYN_L1x16_END
|
||||||
|
|
||||||
|
DCOPYN_L1x16_LOOP:
|
||||||
|
|
||||||
|
COPY_1x16
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt DCOPYN_L1x16_LOOP
|
||||||
|
|
||||||
|
DCOPYN_L1x16_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L1x8_BEGIN:
|
||||||
|
|
||||||
|
/* andi. sets CR0 from the masked value, so ble is taken only when the tested bit of M is clear. */
andi. J, M, 8
|
||||||
|
ble DCOPYN_L1x8_END
|
||||||
|
COPY_1x8
|
||||||
|
|
||||||
|
DCOPYN_L1x8_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L1x4_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 4
|
||||||
|
ble DCOPYN_L1x4_END
|
||||||
|
COPY_1x4
|
||||||
|
|
||||||
|
DCOPYN_L1x4_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L1x2_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 2
|
||||||
|
ble DCOPYN_L1x2_END
|
||||||
|
COPY_1x2
|
||||||
|
|
||||||
|
DCOPYN_L1x2_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L1x1_BEGIN:
|
||||||
|
|
||||||
|
andi. J, M, 1
|
||||||
|
ble DCOPYN_L1x1_END
|
||||||
|
COPY_1x1
|
||||||
|
|
||||||
|
DCOPYN_L1x1_END:
|
||||||
|
|
||||||
|
|
||||||
|
DCOPYN_L1_END:
|
||||||
|
|
|
@ -0,0 +1,691 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=16
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x16
|
||||||
|
|
||||||
|
/* Interleave 16 doublewords from each of the four source pointers A0..A3
   into BO. Output order is A0[i], A1[i], A2[i], A3[i] for i = 0..15.
   Advances A0..A3 by 128 bytes each and BO by 512 bytes in total.
   o0..o112 are index operands supplied by the including file
   (presumably byte offsets 0..112 - confirm). Uses vs0-vs63. */
/* Load phase: vs0-vs7 <- A0, vs8-vs15 <- A1, vs16-vs23 <- A2, vs24-vs31 <- A3
   (two doublewords per lxvd2x). */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
lxvd2x vs24, o0, A3
|
||||||
|
lxvd2x vs16, o0, A2
|
||||||
|
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
lxvd2x vs9, o16, A1
|
||||||
|
lxvd2x vs17, o16, A2
|
||||||
|
lxvd2x vs25, o16, A3
|
||||||
|
|
||||||
|
lxvd2x vs2, o32, A0
|
||||||
|
lxvd2x vs10, o32, A1
|
||||||
|
lxvd2x vs18, o32, A2
|
||||||
|
lxvd2x vs26, o32, A3
|
||||||
|
|
||||||
|
lxvd2x vs3, o48, A0
|
||||||
|
lxvd2x vs11, o48, A1
|
||||||
|
lxvd2x vs19, o48, A2
|
||||||
|
lxvd2x vs27, o48, A3
|
||||||
|
|
||||||
|
lxvd2x vs4, o64, A0
|
||||||
|
lxvd2x vs12, o64, A1
|
||||||
|
lxvd2x vs20, o64, A2
|
||||||
|
lxvd2x vs28, o64, A3
|
||||||
|
|
||||||
|
lxvd2x vs5, o80, A0
|
||||||
|
lxvd2x vs13, o80, A1
|
||||||
|
lxvd2x vs21, o80, A2
|
||||||
|
lxvd2x vs29, o80, A3
|
||||||
|
|
||||||
|
lxvd2x vs6, o96, A0
|
||||||
|
lxvd2x vs14, o96, A1
|
||||||
|
lxvd2x vs22, o96, A2
|
||||||
|
lxvd2x vs30, o96, A3
|
||||||
|
|
||||||
|
lxvd2x vs7, o112, A0
|
||||||
|
lxvd2x vs15, o112, A1
|
||||||
|
lxvd2x vs23, o112, A2
|
||||||
|
lxvd2x vs31, o112, A3
|
||||||
|
|
||||||
|
|
||||||
|
/* Transpose phase: xxpermdi with selector 0 pairs doubleword 0 of both
   sources, selector 3 pairs doubleword 1 of both sources. Each group of
   four results below is therefore two consecutive output rows
   {A0,A1},{A2,A3}. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs16, vs24, 0
|
||||||
|
xxpermdi vs34, vs0, vs8, 3
|
||||||
|
xxpermdi vs35, vs16, vs24, 3
|
||||||
|
|
||||||
|
xxpermdi vs36, vs1, vs9, 0
|
||||||
|
xxpermdi vs37, vs17, vs25, 0
|
||||||
|
xxpermdi vs38, vs1, vs9, 3
|
||||||
|
xxpermdi vs39, vs17, vs25, 3
|
||||||
|
|
||||||
|
xxpermdi vs40, vs2, vs10, 0
|
||||||
|
xxpermdi vs41, vs18, vs26, 0
|
||||||
|
xxpermdi vs42, vs2, vs10, 3
|
||||||
|
xxpermdi vs43, vs18, vs26, 3
|
||||||
|
|
||||||
|
xxpermdi vs44, vs3, vs11, 0
|
||||||
|
xxpermdi vs45, vs19, vs27, 0
|
||||||
|
xxpermdi vs46, vs3, vs11, 3
|
||||||
|
xxpermdi vs47, vs19, vs27, 3
|
||||||
|
|
||||||
|
xxpermdi vs48, vs4, vs12, 0
|
||||||
|
xxpermdi vs49, vs20, vs28, 0
|
||||||
|
xxpermdi vs50, vs4, vs12, 3
|
||||||
|
xxpermdi vs51, vs20, vs28, 3
|
||||||
|
|
||||||
|
xxpermdi vs52, vs5, vs13, 0
|
||||||
|
xxpermdi vs53, vs21, vs29, 0
|
||||||
|
xxpermdi vs54, vs5, vs13, 3
|
||||||
|
xxpermdi vs55, vs21, vs29, 3
|
||||||
|
|
||||||
|
/* Source pointer updates are interleaved with the remaining permutes;
   all loads from A0..A3 have already been issued above. */
addi A0, A0, 128
|
||||||
|
addi A1, A1, 128
|
||||||
|
|
||||||
|
xxpermdi vs56, vs6, vs14, 0
|
||||||
|
xxpermdi vs57, vs22, vs30, 0
|
||||||
|
xxpermdi vs58, vs6, vs14, 3
|
||||||
|
xxpermdi vs59, vs22, vs30, 3
|
||||||
|
|
||||||
|
addi A3, A3, 128
|
||||||
|
addi A2, A2, 128
|
||||||
|
|
||||||
|
xxpermdi vs60, vs7, vs15, 0
|
||||||
|
xxpermdi vs61, vs23, vs31, 0
|
||||||
|
xxpermdi vs62, vs7, vs15, 3
|
||||||
|
xxpermdi vs63, vs23, vs31, 3
|
||||||
|
|
||||||
|
|
||||||
|
/* Store phase: vs32-vs63 are written in order, 128 bytes (8 vectors) per
   group, with BO bumped after each group. */
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
stxvd2x vs36, o64, BO
|
||||||
|
stxvd2x vs37, o80, BO
|
||||||
|
stxvd2x vs38, o96, BO
|
||||||
|
stxvd2x vs39, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
stxvd2x vs40, o0, BO
|
||||||
|
stxvd2x vs41, o16, BO
|
||||||
|
stxvd2x vs42, o32, BO
|
||||||
|
stxvd2x vs43, o48, BO
|
||||||
|
stxvd2x vs44, o64, BO
|
||||||
|
stxvd2x vs45, o80, BO
|
||||||
|
stxvd2x vs46, o96, BO
|
||||||
|
stxvd2x vs47, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
stxvd2x vs48, o0, BO
|
||||||
|
stxvd2x vs49, o16, BO
|
||||||
|
stxvd2x vs50, o32, BO
|
||||||
|
stxvd2x vs51, o48, BO
|
||||||
|
stxvd2x vs52, o64, BO
|
||||||
|
stxvd2x vs53, o80, BO
|
||||||
|
stxvd2x vs54, o96, BO
|
||||||
|
stxvd2x vs55, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
stxvd2x vs56, o0, BO
|
||||||
|
stxvd2x vs57, o16, BO
|
||||||
|
stxvd2x vs58, o32, BO
|
||||||
|
stxvd2x vs59, o48, BO
|
||||||
|
stxvd2x vs60, o64, BO
|
||||||
|
stxvd2x vs61, o80, BO
|
||||||
|
stxvd2x vs62, o96, BO
|
||||||
|
stxvd2x vs63, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x8
|
||||||
|
|
||||||
|
/* Interleave 8 doublewords from each of A0..A3 into BO in the order
   A0[i], A1[i], A2[i], A3[i]. Advances A0..A3 by 64 bytes each and BO by
   256 bytes in total. Uses vs0-vs3, vs8-vs11, vs16-vs19, vs24-vs27 as
   inputs and vs32-vs47 as outputs. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
lxvd2x vs2, o32, A0
|
||||||
|
lxvd2x vs3, o48, A0
|
||||||
|
addi A0, A0, 64
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
lxvd2x vs9, o16, A1
|
||||||
|
lxvd2x vs10, o32, A1
|
||||||
|
lxvd2x vs11, o48, A1
|
||||||
|
addi A1, A1, 64
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs16, o0, A2
|
||||||
|
lxvd2x vs17, o16, A2
|
||||||
|
lxvd2x vs18, o32, A2
|
||||||
|
lxvd2x vs19, o48, A2
|
||||||
|
addi A2, A2, 64
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs24, o0, A3
|
||||||
|
lxvd2x vs25, o16, A3
|
||||||
|
lxvd2x vs26, o32, A3
|
||||||
|
lxvd2x vs27, o48, A3
|
||||||
|
addi A3, A3, 64
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 pairs doubleword 0 of both sources, selector 3 pairs
   doubleword 1 of both sources. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs16, vs24, 0
|
||||||
|
xxpermdi vs34, vs0, vs8, 3
|
||||||
|
xxpermdi vs35, vs16, vs24, 3
|
||||||
|
|
||||||
|
xxpermdi vs36, vs1, vs9, 0
|
||||||
|
xxpermdi vs37, vs17, vs25, 0
|
||||||
|
xxpermdi vs38, vs1, vs9, 3
|
||||||
|
xxpermdi vs39, vs17, vs25, 3
|
||||||
|
|
||||||
|
xxpermdi vs40, vs2, vs10, 0
|
||||||
|
xxpermdi vs41, vs18, vs26, 0
|
||||||
|
xxpermdi vs42, vs2, vs10, 3
|
||||||
|
xxpermdi vs43, vs18, vs26, 3
|
||||||
|
|
||||||
|
xxpermdi vs44, vs3, vs11, 0
|
||||||
|
xxpermdi vs45, vs19, vs27, 0
|
||||||
|
xxpermdi vs46, vs3, vs11, 3
|
||||||
|
xxpermdi vs47, vs19, vs27, 3
|
||||||
|
|
||||||
|
|
||||||
|
/* Write vs32-vs47 in order, 128 bytes per group. */
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
stxvd2x vs36, o64, BO
|
||||||
|
stxvd2x vs37, o80, BO
|
||||||
|
stxvd2x vs38, o96, BO
|
||||||
|
stxvd2x vs39, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
stxvd2x vs40, o0, BO
|
||||||
|
stxvd2x vs41, o16, BO
|
||||||
|
stxvd2x vs42, o32, BO
|
||||||
|
stxvd2x vs43, o48, BO
|
||||||
|
stxvd2x vs44, o64, BO
|
||||||
|
stxvd2x vs45, o80, BO
|
||||||
|
stxvd2x vs46, o96, BO
|
||||||
|
stxvd2x vs47, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x4
|
||||||
|
|
||||||
|
/* Interleave 4 doublewords from each of A0..A3 into BO in the order
   A0[i], A1[i], A2[i], A3[i]. Advances A0..A3 by 32 bytes each and BO by
   128 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
addi A0, A0, 32
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
lxvd2x vs9, o16, A1
|
||||||
|
addi A1, A1, 32
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs16, o0, A2
|
||||||
|
lxvd2x vs17, o16, A2
|
||||||
|
addi A2, A2, 32
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs24, o0, A3
|
||||||
|
lxvd2x vs25, o16, A3
|
||||||
|
addi A3, A3, 32
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 pairs doubleword 0 of both sources, selector 3 pairs
   doubleword 1 of both sources. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs16, vs24, 0
|
||||||
|
xxpermdi vs34, vs0, vs8, 3
|
||||||
|
xxpermdi vs35, vs16, vs24, 3
|
||||||
|
|
||||||
|
xxpermdi vs36, vs1, vs9, 0
|
||||||
|
xxpermdi vs37, vs17, vs25, 0
|
||||||
|
xxpermdi vs38, vs1, vs9, 3
|
||||||
|
xxpermdi vs39, vs17, vs25, 3
|
||||||
|
|
||||||
|
|
||||||
|
/* Write the eight result vectors (128 bytes) and advance BO. */
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
stxvd2x vs36, o64, BO
|
||||||
|
stxvd2x vs37, o80, BO
|
||||||
|
stxvd2x vs38, o96, BO
|
||||||
|
stxvd2x vs39, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x2
|
||||||
|
|
||||||
|
/* Interleave 2 doublewords from each of A0..A3 into BO in the order
   A0[i], A1[i], A2[i], A3[i]. Advances A0..A3 by 16 bytes each and BO by
   64 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
addi A0, A0, 16
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
addi A1, A1, 16
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs16, o0, A2
|
||||||
|
addi A2, A2, 16
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs24, o0, A3
|
||||||
|
addi A3, A3, 16
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 pairs doubleword 0 of both sources, selector 3 pairs
   doubleword 1 of both sources. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs16, vs24, 0
|
||||||
|
xxpermdi vs34, vs0, vs8, 3
|
||||||
|
xxpermdi vs35, vs16, vs24, 3
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x1
|
||||||
|
|
||||||
|
/* Gather one doubleword from each of A0..A3 and store them to BO in the
   order A0, A1, A2, A3. Advances A0..A3 by 8 bytes each and BO by
   32 bytes. */
/* lxsdx is a scalar load: only doubleword 0 of the target register is
   used by the permutes below. */
lxsdx vs0, o0, A0
|
||||||
|
addi A0, A0, 8
|
||||||
|
|
||||||
|
|
||||||
|
lxsdx vs8, o0, A1
|
||||||
|
addi A1, A1, 8
|
||||||
|
|
||||||
|
|
||||||
|
lxsdx vs16, o0, A2
|
||||||
|
addi A2, A2, 8
|
||||||
|
|
||||||
|
|
||||||
|
lxsdx vs24, o0, A3
|
||||||
|
addi A3, A3, 8
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 packs doubleword 0 of both sources into one vector. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs16, vs24, 0
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=16
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x16
|
||||||
|
|
||||||
|
/* Interleave 16 doublewords from each of A0 and A1 into BO in the order
   A0[i], A1[i]. Advances A0 and A1 by 128 bytes each and BO by 256 bytes
   in total. Uses vs0-vs15 as inputs and vs32-vs47 as outputs. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
lxvd2x vs2, o32, A0
|
||||||
|
lxvd2x vs3, o48, A0
|
||||||
|
lxvd2x vs4, o64, A0
|
||||||
|
lxvd2x vs5, o80, A0
|
||||||
|
lxvd2x vs6, o96, A0
|
||||||
|
lxvd2x vs7, o112, A0
|
||||||
|
addi A0, A0, 128
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
lxvd2x vs9, o16, A1
|
||||||
|
lxvd2x vs10, o32, A1
|
||||||
|
lxvd2x vs11, o48, A1
|
||||||
|
lxvd2x vs12, o64, A1
|
||||||
|
lxvd2x vs13, o80, A1
|
||||||
|
lxvd2x vs14, o96, A1
|
||||||
|
lxvd2x vs15, o112, A1
|
||||||
|
addi A1, A1, 128
|
||||||
|
|
||||||
|
|
||||||
|
/* For each input pair: selector 0 yields {A0[2k], A1[2k]}, selector 3
   yields {A0[2k+1], A1[2k+1]}. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs0, vs8, 3
|
||||||
|
|
||||||
|
xxpermdi vs34, vs1, vs9, 0
|
||||||
|
xxpermdi vs35, vs1, vs9, 3
|
||||||
|
|
||||||
|
xxpermdi vs36, vs2, vs10, 0
|
||||||
|
xxpermdi vs37, vs2, vs10, 3
|
||||||
|
|
||||||
|
xxpermdi vs38, vs3, vs11, 0
|
||||||
|
xxpermdi vs39, vs3, vs11, 3
|
||||||
|
|
||||||
|
xxpermdi vs40, vs4, vs12, 0
|
||||||
|
xxpermdi vs41, vs4, vs12, 3
|
||||||
|
|
||||||
|
xxpermdi vs42, vs5, vs13, 0
|
||||||
|
xxpermdi vs43, vs5, vs13, 3
|
||||||
|
|
||||||
|
xxpermdi vs44, vs6, vs14, 0
|
||||||
|
xxpermdi vs45, vs6, vs14, 3
|
||||||
|
|
||||||
|
xxpermdi vs46, vs7, vs15, 0
|
||||||
|
xxpermdi vs47, vs7, vs15, 3
|
||||||
|
|
||||||
|
|
||||||
|
/* Write vs32-vs47 in order, 128 bytes per group. */
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
stxvd2x vs36, o64, BO
|
||||||
|
stxvd2x vs37, o80, BO
|
||||||
|
stxvd2x vs38, o96, BO
|
||||||
|
stxvd2x vs39, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
stxvd2x vs40, o0, BO
|
||||||
|
stxvd2x vs41, o16, BO
|
||||||
|
stxvd2x vs42, o32, BO
|
||||||
|
stxvd2x vs43, o48, BO
|
||||||
|
stxvd2x vs44, o64, BO
|
||||||
|
stxvd2x vs45, o80, BO
|
||||||
|
stxvd2x vs46, o96, BO
|
||||||
|
stxvd2x vs47, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x8
|
||||||
|
|
||||||
|
/* Interleave 8 doublewords from each of A0 and A1 into BO in the order
   A0[i], A1[i]. Advances A0 and A1 by 64 bytes each and BO by 128 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
lxvd2x vs2, o32, A0
|
||||||
|
lxvd2x vs3, o48, A0
|
||||||
|
addi A0, A0, 64
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
lxvd2x vs9, o16, A1
|
||||||
|
lxvd2x vs10, o32, A1
|
||||||
|
lxvd2x vs11, o48, A1
|
||||||
|
addi A1, A1, 64
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 pairs doubleword 0 of both sources, selector 3 pairs
   doubleword 1 of both sources. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs0, vs8, 3
|
||||||
|
|
||||||
|
xxpermdi vs34, vs1, vs9, 0
|
||||||
|
xxpermdi vs35, vs1, vs9, 3
|
||||||
|
|
||||||
|
xxpermdi vs36, vs2, vs10, 0
|
||||||
|
xxpermdi vs37, vs2, vs10, 3
|
||||||
|
|
||||||
|
xxpermdi vs38, vs3, vs11, 0
|
||||||
|
xxpermdi vs39, vs3, vs11, 3
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
stxvd2x vs36, o64, BO
|
||||||
|
stxvd2x vs37, o80, BO
|
||||||
|
stxvd2x vs38, o96, BO
|
||||||
|
stxvd2x vs39, o112, BO
|
||||||
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x4
|
||||||
|
|
||||||
|
/* Interleave 4 doublewords from each of A0 and A1 into BO in the order
   A0[i], A1[i]. Advances A0 and A1 by 32 bytes each and BO by 64 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
addi A0, A0, 32
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
lxvd2x vs9, o16, A1
|
||||||
|
addi A1, A1, 32
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 pairs doubleword 0 of both sources, selector 3 pairs
   doubleword 1 of both sources. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs0, vs8, 3
|
||||||
|
|
||||||
|
xxpermdi vs34, vs1, vs9, 0
|
||||||
|
xxpermdi vs35, vs1, vs9, 3
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
stxvd2x vs34, o32, BO
|
||||||
|
stxvd2x vs35, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x2
|
||||||
|
|
||||||
|
/* Interleave 2 doublewords from each of A0 and A1 into BO in the order
   A0[0], A1[0], A0[1], A1[1]. Advances A0 and A1 by 16 bytes each and BO
   by 32 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
addi A0, A0, 16
|
||||||
|
|
||||||
|
|
||||||
|
lxvd2x vs8, o0, A1
|
||||||
|
addi A1, A1, 16
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 pairs doubleword 0 of both sources, selector 3 pairs
   doubleword 1 of both sources. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
xxpermdi vs33, vs0, vs8, 3
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, BO
|
||||||
|
stxvd2x vs33, o16, BO
|
||||||
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x1
|
||||||
|
|
||||||
|
/* Gather one doubleword from each of A0 and A1 and store the pair to BO.
   Advances A0 and A1 by 8 bytes each and BO by 16 bytes. */
lxsdx vs0, o0, A0
|
||||||
|
addi A0, A0, 8
|
||||||
|
|
||||||
|
|
||||||
|
lxsdx vs8, o0, A1
|
||||||
|
addi A1, A1, 8
|
||||||
|
|
||||||
|
|
||||||
|
/* Selector 0 packs doubleword 0 of vs0 and vs8 into one vector. */
xxpermdi vs32, vs0, vs8, 0
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs32, o0, BO
|
||||||
|
addi BO, BO, 16
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=16
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x16
|
||||||
|
|
||||||
|
/* Straight copy of 16 doublewords (128 bytes) from A0 to BO, no
   interleaving. Advances A0 by 128 bytes and BO by 128 bytes (two steps
   of 64). */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
lxvd2x vs2, o32, A0
|
||||||
|
lxvd2x vs3, o48, A0
|
||||||
|
lxvd2x vs4, o64, A0
|
||||||
|
lxvd2x vs5, o80, A0
|
||||||
|
lxvd2x vs6, o96, A0
|
||||||
|
lxvd2x vs7, o112, A0
|
||||||
|
addi A0, A0, 128
|
||||||
|
|
||||||
|
|
||||||
|
/* The stores only use offsets o0..o48, so BO is bumped by 64 between the
   two halves. */
stxvd2x vs0, o0, BO
|
||||||
|
stxvd2x vs1, o16, BO
|
||||||
|
stxvd2x vs2, o32, BO
|
||||||
|
stxvd2x vs3, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
stxvd2x vs4, o0, BO
|
||||||
|
stxvd2x vs5, o16, BO
|
||||||
|
stxvd2x vs6, o32, BO
|
||||||
|
stxvd2x vs7, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x8
|
||||||
|
|
||||||
|
/* Straight copy of 8 doublewords (64 bytes) from A0 to BO. Advances both
   pointers by 64 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
lxvd2x vs2, o32, A0
|
||||||
|
lxvd2x vs3, o48, A0
|
||||||
|
addi A0, A0, 64
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs0, o0, BO
|
||||||
|
stxvd2x vs1, o16, BO
|
||||||
|
stxvd2x vs2, o32, BO
|
||||||
|
stxvd2x vs3, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x4
|
||||||
|
|
||||||
|
/* Straight copy of 4 doublewords (32 bytes) from A0 to BO. Advances both
   pointers by 32 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
lxvd2x vs1, o16, A0
|
||||||
|
addi A0, A0, 32
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs0, o0, BO
|
||||||
|
stxvd2x vs1, o16, BO
|
||||||
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x2
|
||||||
|
|
||||||
|
/* Straight copy of 2 doublewords (16 bytes) from A0 to BO. Advances both
   pointers by 16 bytes. */
lxvd2x vs0, o0, A0
|
||||||
|
addi A0, A0, 16
|
||||||
|
|
||||||
|
|
||||||
|
stxvd2x vs0, o0, BO
|
||||||
|
addi BO, BO, 16
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x1
|
||||||
|
|
||||||
|
/* Copy a single doubleword (8 bytes) from A0 to BO using the scalar
   load/store forms. Advances both pointers by 8 bytes. */
lxsdx vs0, o0, A0
|
||||||
|
addi A0, A0, 8
|
||||||
|
|
||||||
|
|
||||||
|
stxsdx vs0, o0, BO
|
||||||
|
addi BO, BO, 8
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
|
@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
add B2, B2, B
|
add B2, B2, B
|
||||||
add B1, B1, B
|
add B1, B1, B
|
||||||
|
|
||||||
li PREA, 768
|
li PREA, 256
|
||||||
addi PREB, M16, 128
|
addi PREB, M16, 128
|
||||||
|
|
||||||
li o8, 8
|
li o8, 8
|
||||||
|
|
|
@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:
|
||||||
|
|
||||||
DCOPYT_L4x16_LOOP:
|
DCOPYT_L4x16_LOOP:
|
||||||
|
|
||||||
|
/*
|
||||||
addi T1, PREB, 128
|
addi T1, PREB, 128
|
||||||
addi T2, PREB, 256
|
addi T2, PREB, 256
|
||||||
|
*/
|
||||||
dcbt A0, PREA
|
dcbt A0, PREA
|
||||||
dcbt A1, PREA
|
dcbt A1, PREA
|
||||||
dcbt A2, PREA
|
dcbt A2, PREA
|
||||||
dcbt A3, PREA
|
dcbt A3, PREA
|
||||||
|
/*
|
||||||
dcbtst BO, M16
|
dcbtst BO, M16
|
||||||
dcbtst BO, PREB
|
dcbtst BO, PREB
|
||||||
dcbtst BO, T1
|
dcbtst BO, T1
|
||||||
dcbtst BO, T2
|
dcbtst BO, T2
|
||||||
|
*/
|
||||||
COPY_4x16
|
COPY_4x16
|
||||||
|
|
||||||
add BO, BO, M16
|
add BO, BO, M16
|
||||||
|
|
|
@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define PRE r30
|
#define PRE r30
|
||||||
#define T2 r31
|
#define T2 r31
|
||||||
|
|
||||||
#include "dgemm_macros_16x4_power8.S"
|
#include "dtrmm_macros_16x4_power8.S"
|
||||||
|
|
||||||
|
|
||||||
#ifndef NEEDPARAM
|
#ifndef NEEDPARAM
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,207 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
/* r3-r7: presumably the incoming arguments of the kernel (these are the
   first integer argument registers in the PowerPC64 ELF ABI) - confirm
   against the C prototype. */
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define A r5
|
||||||
|
#define LDA r6
|
||||||
|
#define B r7
|
||||||
|
|
||||||
|
/* Source pointer aliases; presumably set up and used by the included
   copy logic/macros (not assigned in this file's visible code). */
#define A0 r8
|
||||||
|
#define A1 r9
|
||||||
|
#define A2 r10
|
||||||
|
#define A3 r11
|
||||||
|
|
||||||
|
#define J r12
|
||||||
|
|
||||||
|
/* r14 and above are saved in the prologue and restored in the epilogue
   of this kernel. */
#define PREA r14
|
||||||
|
#define PREB r15
|
||||||
|
#define BO r16
|
||||||
|
#define B8 r17
|
||||||
|
#define B4 r18
|
||||||
|
#define B2 r19
|
||||||
|
#define B1 r20
|
||||||
|
#define o4 r21
|
||||||
|
#define T2 r22
|
||||||
|
#define I r23
|
||||||
|
#define o16 r24
|
||||||
|
#define o32 r25
|
||||||
|
#define o48 r26
|
||||||
|
#define NOTU1 r29
|
||||||
|
#define M8 r30
|
||||||
|
#define T1 r31
|
||||||
|
|
||||||
|
/* o0 is the literal 0, not a register alias; the other oNN names are
   registers loaded with their constant in the kernel body. */
#define o0 0
|
||||||
|
|
||||||
|
#include "sgemm_tcopy_macros_8_power8.S"
|
||||||
|
|
||||||
|
#define STACKSIZE 384
|
||||||
|
|
||||||
|
|
||||||
|
/* Kernel entry: saves r14-r31, derives the byte strides and destination
   pointers from M, N, LDA and B, runs the included copy logic, then
   restores the registers and returns 0 in r3. */
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
/* Reserve STACKSIZE bytes of stack for the register save area. */
addi SP, SP, -STACKSIZE
|
||||||
|
li r0, 0
|
||||||
|
|
||||||
|
/* Save r14-r31 at SP+144 .. SP+280. */
std r31, 144(SP)
|
||||||
|
std r30, 152(SP)
|
||||||
|
std r29, 160(SP)
|
||||||
|
std r28, 168(SP)
|
||||||
|
std r27, 176(SP)
|
||||||
|
std r26, 184(SP)
|
||||||
|
std r25, 192(SP)
|
||||||
|
std r24, 200(SP)
|
||||||
|
std r23, 208(SP)
|
||||||
|
std r22, 216(SP)
|
||||||
|
std r21, 224(SP)
|
||||||
|
std r20, 232(SP)
|
||||||
|
std r19, 240(SP)
|
||||||
|
std r18, 248(SP)
|
||||||
|
std r17, 256(SP)
|
||||||
|
std r16, 264(SP)
|
||||||
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
/* Nothing to do when M <= 0 or N <= 0 (32-bit signed compares). */
cmpwi cr0, M, 0
|
||||||
|
ble- L999
|
||||||
|
cmpwi cr0, N, 0
|
||||||
|
ble- L999
|
||||||
|
|
||||||
|
/* Scale LDA by the element size (BASE_SHIFT comes from the included
   headers); M8 = M << (3 + BASE_SHIFT), i.e. M * 8 elements. */
slwi LDA, LDA, BASE_SHIFT
|
||||||
|
slwi M8, M, 3 + BASE_SHIFT
|
||||||
|
|
||||||
|
/* Masks -8, -4, -2 round N down to a multiple of 8, 4 and 2. PREA and
   PREB are used here only as scratch registers for the masks. */
li T2, -8
|
||||||
|
li PREA, -4
|
||||||
|
li PREB, -2
|
||||||
|
|
||||||
|
and B4, N, T2
|
||||||
|
and B2, N, PREA
|
||||||
|
and B1, N, PREB
|
||||||
|
|
||||||
|
/* B4/B2/B1 = B + (rounded N) * M * element size: destination pointers
   derived from the three rounded values of N.
   NOTE(review): mullw/slwi are 32-bit operations - very large M*N
   products would wrap; confirm the supported size range. */
mullw B4, B4, M
|
||||||
|
mullw B2, B2, M
|
||||||
|
mullw B1, B1, M
|
||||||
|
|
||||||
|
slwi B4, B4, BASE_SHIFT
|
||||||
|
slwi B2, B2, BASE_SHIFT
|
||||||
|
slwi B1, B1, BASE_SHIFT
|
||||||
|
|
||||||
|
add B4, B4, B
|
||||||
|
add B2, B2, B
|
||||||
|
add B1, B1, B
|
||||||
|
|
||||||
|
/* PREA = 384 and PREB = M8 + 128; presumably prefetch distances consumed
   by the included logic - confirm there. */
li PREA, 384
|
||||||
|
addi PREB, M8, 128
|
||||||
|
|
||||||
|
/* Constant index registers used as offsets by the copy macros. */
li o4, 4
|
||||||
|
li o16, 16
|
||||||
|
li o32, 32
|
||||||
|
li o48, 48
|
||||||
|
|
||||||
|
/* The copy loops themselves live in the included logic file. */
#include "sgemm_tcopy_logic_8_power8.S"
|
||||||
|
|
||||||
|
/* Common exit: return value 0, restore r14-r31, release the stack. */
L999:
|
||||||
|
|
||||||
|
li r3, 0
|
||||||
|
|
||||||
|
ld r31, 144(SP)
|
||||||
|
ld r30, 152(SP)
|
||||||
|
ld r29, 160(SP)
|
||||||
|
ld r28, 168(SP)
|
||||||
|
ld r27, 176(SP)
|
||||||
|
ld r26, 184(SP)
|
||||||
|
ld r25, 192(SP)
|
||||||
|
ld r24, 200(SP)
|
||||||
|
ld r23, 208(SP)
|
||||||
|
ld r22, 216(SP)
|
||||||
|
ld r21, 224(SP)
|
||||||
|
ld r20, 232(SP)
|
||||||
|
ld r19, 240(SP)
|
||||||
|
ld r18, 248(SP)
|
||||||
|
ld r17, 256(SP)
|
||||||
|
ld r16, 264(SP)
|
||||||
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
|
|
||||||
|
addi SP, SP, STACKSIZE
|
||||||
|
|
||||||
|
blr
|
||||||
|
EPILOGUE
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,299 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/* Row groups of four: I = M >> 2. If that is not positive, go straight to the two-row remainder. */
srawi. I, M, 2
|
||||||
|
ble SCOPYOT_L2_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
SCOPYOT_L4_BEGIN:
|
||||||
|
|
||||||
|
mr A0, A /* A0..A3 = four consecutive source rows, each LDA apart */
|
||||||
|
add A1, A0, LDA
|
||||||
|
add A2, A1, LDA
|
||||||
|
add A3, A2, LDA
|
||||||
|
add A, A3, LDA /* A now points at the first row of the next group */
|
||||||
|
mr B8, B /* B8 = where this row group's 8-wide chunks start */
|
||||||
|
addi B, B, 32*SIZE /* next row group starts 4 rows * 8 elements further on */
|
||||||
|
|
||||||
|
sradi. J, N, 3 /* J = N >> 3 = number of full 8-element column chunks */
|
||||||
|
ble SCOPYOT_L4x4_BEGIN
|
||||||
|
|
||||||
|
mr BO, B8
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
SCOPYOT_L4x8_LOOP:
|
||||||
|
|
||||||
|
/* The loop body is unrolled four times; the dcbt touches at row+PREA are issued once per pass, not once per copy. */
dcbt A0, PREA
|
||||||
|
dcbt A1, PREA
|
||||||
|
dcbt A2, PREA
|
||||||
|
dcbt A3, PREA
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
addi A0, A0, 8*SIZE
|
||||||
|
addi A1, A1, 8*SIZE
|
||||||
|
addi A2, A2, 8*SIZE
|
||||||
|
addi A3, A3, 8*SIZE
|
||||||
|
add BO, BO, M8 /* destination steps by M8 bytes for the next column chunk */
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
ble SCOPYOT_L4x4_BEGIN /* leave the unrolled body early once J is exhausted */
|
||||||
|
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
addi A0, A0, 8*SIZE
|
||||||
|
addi A1, A1, 8*SIZE
|
||||||
|
addi A2, A2, 8*SIZE
|
||||||
|
addi A3, A3, 8*SIZE
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
ble SCOPYOT_L4x4_BEGIN
|
||||||
|
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
addi A0, A0, 8*SIZE
|
||||||
|
addi A1, A1, 8*SIZE
|
||||||
|
addi A2, A2, 8*SIZE
|
||||||
|
addi A3, A3, 8*SIZE
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
ble SCOPYOT_L4x4_BEGIN
|
||||||
|
|
||||||
|
COPY_4x8
|
||||||
|
|
||||||
|
addi A0, A0, 8*SIZE
|
||||||
|
addi A1, A1, 8*SIZE
|
||||||
|
addi A2, A2, 8*SIZE
|
||||||
|
addi A3, A3, 8*SIZE
|
||||||
|
add BO, BO, M8
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt SCOPYOT_L4x8_LOOP
|
||||||
|
|
||||||
|
/* Column remainders for the current four-row group: bits 2, 1 and 0 of N select one 4-, 2- and 1-element chunk. */
SCOPYOT_L4x4_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 4
|
||||||
|
ble SCOPYOT_L4x2_BEGIN /* andi. result is 0 or 4, so this skips when the bit is clear */
|
||||||
|
|
||||||
|
mr BO, B4 /* B4 = write cursor for the 4-wide chunks (initialised outside this block) */
|
||||||
|
|
||||||
|
COPY_4x4
|
||||||
|
|
||||||
|
addi A0, A0, 4*SIZE
|
||||||
|
addi A1, A1, 4*SIZE
|
||||||
|
addi A2, A2, 4*SIZE
|
||||||
|
addi A3, A3, 4*SIZE
|
||||||
|
|
||||||
|
addi B4, B4, 16*SIZE /* 4 rows * 4 elements were written */
|
||||||
|
|
||||||
|
SCOPYOT_L4x2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 2
|
||||||
|
ble SCOPYOT_L4x1_BEGIN
|
||||||
|
|
||||||
|
mr BO, B2 /* B2 = write cursor for the 2-wide chunks */
|
||||||
|
|
||||||
|
COPY_4x2
|
||||||
|
|
||||||
|
addi A0, A0, 2*SIZE
|
||||||
|
addi A1, A1, 2*SIZE
|
||||||
|
addi A2, A2, 2*SIZE
|
||||||
|
addi A3, A3, 2*SIZE
|
||||||
|
|
||||||
|
addi B2, B2, 8*SIZE /* 4 rows * 2 elements were written */
|
||||||
|
|
||||||
|
SCOPYOT_L4x1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 1
|
||||||
|
ble SCOPYOT_L4_END
|
||||||
|
|
||||||
|
mr BO, B1 /* B1 = write cursor for the single-element chunks */
|
||||||
|
|
||||||
|
COPY_4x1
|
||||||
|
|
||||||
|
addi A0, A0, 1*SIZE
|
||||||
|
addi A1, A1, 1*SIZE
|
||||||
|
addi A2, A2, 1*SIZE
|
||||||
|
addi A3, A3, 1*SIZE
|
||||||
|
|
||||||
|
addi B1, B1, 4*SIZE /* 4 rows * 1 element were written */
|
||||||
|
|
||||||
|
SCOPYOT_L4_END:
|
||||||
|
|
||||||
|
addic. I, I, -1 /* one four-row group finished */
|
||||||
|
bgt SCOPYOT_L4_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Two-row remainder: executed once when bit 1 of M is set. Same structure as the four-row path, with rows A0 and A1 only. */
SCOPYOT_L2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, M, 2
|
||||||
|
ble SCOPYOT_L1_BEGIN
|
||||||
|
|
||||||
|
mr A0, A /* A0, A1 = the two source rows, LDA apart */
|
||||||
|
add A1, A0, LDA
|
||||||
|
add A, A1, LDA
|
||||||
|
mr B8, B
|
||||||
|
addi B, B, 16*SIZE /* 2 rows * 8 elements per 8-wide chunk */
|
||||||
|
|
||||||
|
sradi. J, N, 3 /* J = number of full 8-element column chunks */
|
||||||
|
ble SCOPYOT_L2x4_BEGIN
|
||||||
|
|
||||||
|
mr BO, B8
|
||||||
|
|
||||||
|
SCOPYOT_L2x8_LOOP:
|
||||||
|
|
||||||
|
COPY_2x8
|
||||||
|
|
||||||
|
addi A0, A0, 8*SIZE
|
||||||
|
addi A1, A1, 8*SIZE
|
||||||
|
add BO, BO, M8 /* destination steps by M8 bytes for the next column chunk */
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt SCOPYOT_L2x8_LOOP
|
||||||
|
|
||||||
|
SCOPYOT_L2x4_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 4
|
||||||
|
ble SCOPYOT_L2x2_BEGIN
|
||||||
|
|
||||||
|
mr BO, B4
|
||||||
|
|
||||||
|
COPY_2x4
|
||||||
|
|
||||||
|
addi A0, A0, 4*SIZE
|
||||||
|
addi A1, A1, 4*SIZE
|
||||||
|
|
||||||
|
addi B4, B4, 8*SIZE /* 2 rows * 4 elements were written */
|
||||||
|
|
||||||
|
SCOPYOT_L2x2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 2
|
||||||
|
ble SCOPYOT_L2x1_BEGIN
|
||||||
|
|
||||||
|
mr BO, B2
|
||||||
|
|
||||||
|
COPY_2x2
|
||||||
|
|
||||||
|
addi A0, A0, 2*SIZE
|
||||||
|
addi A1, A1, 2*SIZE
|
||||||
|
|
||||||
|
addi B2, B2, 4*SIZE /* 2 rows * 2 elements were written */
|
||||||
|
|
||||||
|
SCOPYOT_L2x1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 1
|
||||||
|
ble SCOPYOT_L2_END
|
||||||
|
|
||||||
|
mr BO, B1
|
||||||
|
|
||||||
|
COPY_2x1
|
||||||
|
|
||||||
|
addi A0, A0, 1*SIZE
|
||||||
|
addi A1, A1, 1*SIZE
|
||||||
|
|
||||||
|
addi B1, B1, 2*SIZE /* 2 rows * 1 element were written */
|
||||||
|
|
||||||
|
SCOPYOT_L2_END:
|
||||||
|
|
||||||
|
|
||||||
|
/* Single-row remainder: executed once when bit 0 of M is set; otherwise control leaves through L999, a label not defined in this block. */
SCOPYOT_L1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, M, 1
|
||||||
|
ble L999
|
||||||
|
|
||||||
|
mr A0, A /* A0 = the one remaining source row */
|
||||||
|
add A, A0, LDA
|
||||||
|
mr B8, B
|
||||||
|
addi B, B, 8*SIZE /* 1 row * 8 elements per 8-wide chunk */
|
||||||
|
|
||||||
|
sradi. J, N, 3 /* J = number of full 8-element column chunks */
|
||||||
|
ble SCOPYOT_L1x4_BEGIN
|
||||||
|
|
||||||
|
mr BO, B8
|
||||||
|
|
||||||
|
SCOPYOT_L1x8_LOOP:
|
||||||
|
|
||||||
|
COPY_1x8
|
||||||
|
|
||||||
|
addi A0, A0, 8*SIZE
|
||||||
|
add BO, BO, M8 /* destination steps by M8 bytes for the next column chunk */
|
||||||
|
|
||||||
|
addic. J, J, -1
|
||||||
|
bgt SCOPYOT_L1x8_LOOP
|
||||||
|
|
||||||
|
SCOPYOT_L1x4_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 4
|
||||||
|
ble SCOPYOT_L1x2_BEGIN
|
||||||
|
|
||||||
|
mr BO, B4
|
||||||
|
|
||||||
|
COPY_1x4
|
||||||
|
|
||||||
|
addi A0, A0, 4*SIZE
|
||||||
|
|
||||||
|
addi B4, B4, 4*SIZE /* 1 row * 4 elements were written */
|
||||||
|
|
||||||
|
SCOPYOT_L1x2_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 2
|
||||||
|
ble SCOPYOT_L1x1_BEGIN
|
||||||
|
|
||||||
|
mr BO, B2
|
||||||
|
|
||||||
|
COPY_1x2
|
||||||
|
|
||||||
|
addi A0, A0, 2*SIZE
|
||||||
|
|
||||||
|
addi B2, B2, 2*SIZE /* 1 row * 2 elements were written */
|
||||||
|
|
||||||
|
SCOPYOT_L1x1_BEGIN:
|
||||||
|
|
||||||
|
andi. T1, N, 1
|
||||||
|
ble SCOPYOT_L1_END
|
||||||
|
|
||||||
|
mr BO, B1
|
||||||
|
|
||||||
|
COPY_1x1
|
||||||
|
|
||||||
|
addi A0, A0, 1*SIZE
|
||||||
|
|
||||||
|
addi B1, B1, 1*SIZE /* 1 row * 1 element was written */
|
||||||
|
|
||||||
|
SCOPYOT_L1_END:
|
||||||
|
|
|
@ -0,0 +1,308 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=4 and N=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_4x8: copy 8 consecutive elements from each of the rows A0..A3 to 128 contiguous bytes at BO, row after row. */
/* Reads A0-A3 and BO without modifying them; clobbers T1 and vs32-vs39. */
/* NOTE(review): relies on o16/o32/o48 holding 16/32/48 and o0 acting as a zero offset - set outside this macro, confirm. */
.macro COPY_4x8
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
|
||||||
|
lxvw4x vs34, o0, A1
|
||||||
|
lxvw4x vs35, o16, A1
|
||||||
|
|
||||||
|
lxvw4x vs36, o0, A2
|
||||||
|
lxvw4x vs37, o16, A2
|
||||||
|
|
||||||
|
lxvw4x vs38, o0, A3
|
||||||
|
lxvw4x vs39, o16, A3
|
||||||
|
|
||||||
|
mr T1, BO /* work on a copy so BO is left untouched for the caller */
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs34, o32, T1
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
addi T1, T1, 64 /* second half of the output: rows A2 and A3 */
|
||||||
|
|
||||||
|
stxvw4x vs36, o0, T1
|
||||||
|
stxvw4x vs37, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs38, o32, T1
|
||||||
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=4 and N=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_4x4: copy one 4-element vector from each of the rows A0..A3 to 64 contiguous bytes at BO. */
/* Reads A0-A3 and BO without modifying them; clobbers T1 and vs32-vs35. */
.macro COPY_4x4
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
lxvw4x vs33, o0, A1
|
||||||
|
|
||||||
|
lxvw4x vs34, o0, A2
|
||||||
|
|
||||||
|
lxvw4x vs35, o0, A3
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1 /* row A0 */
|
||||||
|
|
||||||
|
stxvw4x vs33, o16, T1 /* row A1 */
|
||||||
|
|
||||||
|
stxvw4x vs34, o32, T1 /* row A2 */
|
||||||
|
|
||||||
|
stxvw4x vs35, o48, T1 /* row A3 */
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=4 and N=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_4x2: copy 2 consecutive single-precision elements from each of the rows A0..A3 to 32 contiguous bytes at BO. */
/* Uses scalar loads/stores (one element each); clobbers T1 and vs32-vs39, leaves A0-A3 and BO unchanged. */
/* NOTE(review): relies on o4 holding 4 (set outside this macro) - confirm. */
.macro COPY_4x2
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
lxsspx vs33, o4, A0
|
||||||
|
|
||||||
|
lxsspx vs34, o0, A1
|
||||||
|
lxsspx vs35, o4, A1
|
||||||
|
|
||||||
|
lxsspx vs36, o0, A2
|
||||||
|
lxsspx vs37, o4, A2
|
||||||
|
|
||||||
|
lxsspx vs38, o0, A3
|
||||||
|
lxsspx vs39, o4, A3
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8 /* move on by the two elements just stored */
|
||||||
|
|
||||||
|
stxsspx vs34, o0, T1
|
||||||
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8
|
||||||
|
|
||||||
|
stxsspx vs36, o0, T1
|
||||||
|
stxsspx vs37, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8
|
||||||
|
|
||||||
|
stxsspx vs38, o0, T1
|
||||||
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=4 and N=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_4x1: copy one single-precision element from each of the rows A0..A3 to 16 contiguous bytes at BO. */
/* Clobbers T1 and vs32-vs35; leaves A0-A3 and BO unchanged. */
.macro COPY_4x1
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
lxsspx vs33, o0, A1
|
||||||
|
|
||||||
|
lxsspx vs34, o0, A2
|
||||||
|
|
||||||
|
lxsspx vs35, o0, A3
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8 /* rows A2 and A3 go into the next two element slots */
|
||||||
|
|
||||||
|
stxsspx vs34, o0, T1
|
||||||
|
|
||||||
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=2 and N=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_2x8: copy 8 consecutive elements from each of the rows A0 and A1 to 64 contiguous bytes at BO. */
/* Clobbers T1 and vs32-vs35; leaves A0, A1 and BO unchanged. */
.macro COPY_2x8
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
|
||||||
|
lxvw4x vs34, o0, A1
|
||||||
|
lxvw4x vs35, o16, A1
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1 /* row A0 occupies bytes 0..31 */
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
stxvw4x vs34, o32, T1 /* row A1 occupies bytes 32..63 */
|
||||||
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=2 and N=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_2x4: copy one 4-element vector from each of the rows A0 and A1 to 32 contiguous bytes at BO. */
/* Clobbers T1, vs32 and vs33; leaves A0, A1 and BO unchanged. */
.macro COPY_2x4
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
lxvw4x vs33, o0, A1
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=2 and N=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_2x2: copy 2 consecutive single-precision elements from each of the rows A0 and A1 to 16 contiguous bytes at BO. */
/* Clobbers T1 and vs32-vs35; leaves A0, A1 and BO unchanged. */
.macro COPY_2x2
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
lxsspx vs33, o4, A0
|
||||||
|
|
||||||
|
lxsspx vs34, o0, A1
|
||||||
|
lxsspx vs35, o4, A1
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
addi T1, T1, 8 /* row A1 follows directly after row A0's two elements */
|
||||||
|
|
||||||
|
stxsspx vs34, o0, T1
|
||||||
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=2 and N=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_2x1: copy one single-precision element from each of the rows A0 and A1 to 8 contiguous bytes at BO. */
/* Clobbers T1, vs32 and vs33; leaves A0, A1 and BO unchanged. */
.macro COPY_2x1
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
lxsspx vs33, o0, A1
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=1 and N=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_1x8: copy 8 consecutive elements (two 16-byte vectors) from row A0 to 32 bytes at BO. */
/* Clobbers T1, vs32 and vs33; leaves A0 and BO unchanged. */
.macro COPY_1x8
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
lxvw4x vs33, o16, A0
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=1 and N=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_1x4: copy one 4-element vector (16 bytes) from row A0 to BO. */
/* Clobbers T1 and vs32; leaves A0 and BO unchanged. */
.macro COPY_1x4
|
||||||
|
|
||||||
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for M=1 and N=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_1x2: copy 2 consecutive single-precision elements from row A0 to 8 bytes at BO. */
/* Clobbers T1, vs32 and vs33; leaves A0 and BO unchanged. */
.macro COPY_1x2
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
lxsspx vs33, o4, A0
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
/* COPY_1x1: copy a single single-precision element from row A0 to BO. */
/* Clobbers T1 and vs32; leaves A0 and BO unchanged. */
.macro COPY_1x1
|
||||||
|
|
||||||
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
mr T1, BO
|
||||||
|
|
||||||
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
|
@ -1,3 +1,73 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
|
@ -250,7 +320,7 @@
|
||||||
ble L999
|
ble L999
|
||||||
|
|
||||||
slwi LDC, LDC, ZBASE_SHIFT
|
slwi LDC, LDC, ZBASE_SHIFT
|
||||||
li PRE, 384
|
li PRE, 512
|
||||||
li o8 , 8
|
li o8 , 8
|
||||||
li o16 , 16
|
li o16 , 16
|
||||||
li o24 , 24
|
li o24 , 24
|
||||||
|
|
|
@ -1,3 +1,39 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
srawi. J, N, 1
|
srawi. J, N, 1
|
||||||
ble ZGEMM_L2_END
|
ble ZGEMM_L2_END
|
||||||
|
|
||||||
|
@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN:
|
||||||
|
|
||||||
mr BO, B
|
mr BO, B
|
||||||
mr BBO, BBUFFER
|
mr BBO, BBUFFER
|
||||||
slwi T1, K, 1
|
srawi. T1, K, 2
|
||||||
|
ble ZGEMM_L2_COPYB1
|
||||||
|
|
||||||
ZGEMM_L2_COPYB:
|
ZGEMM_L2_COPYB8:
|
||||||
|
|
||||||
lxvdsx vs4, o0, BO // b0_r
|
addi T2, PRE, 128
|
||||||
lxvdsx vs5, o8, BO // b0_i
|
dcbt BO, PRE
|
||||||
addi BO, BO, 16
|
dcbtst BBO, PRE
|
||||||
stxvd2x vs4, o0, BBO
|
dcbtst BBO, T2
|
||||||
stxvd2x vs5, o16, BBO
|
ZCOPYB_8x1
|
||||||
addic. T1, T1, -1
|
addic. T1, T1, -1
|
||||||
addi BBO, BBO, 32
|
|
||||||
|
|
||||||
bge ZGEMM_L2_COPYB
|
bgt ZGEMM_L2_COPYB8
|
||||||
|
|
||||||
|
ZGEMM_L2_COPYB1:
|
||||||
|
|
||||||
|
andi. T1, K, 3
|
||||||
|
ble ZGEMM_L2_COPYB_END
|
||||||
|
|
||||||
|
ZGEMM_L2_COPYB_LOOP:
|
||||||
|
|
||||||
|
ZCOPYB_1x1
|
||||||
|
ZCOPYB_1x1
|
||||||
|
addic. T1, T1, -1
|
||||||
|
|
||||||
|
bgt ZGEMM_L2_COPYB_LOOP
|
||||||
|
|
||||||
|
ZGEMM_L2_COPYB_END:
|
||||||
|
|
||||||
mr CO, C
|
mr CO, C
|
||||||
mr AO, A
|
mr AO, A
|
||||||
|
@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN:
|
||||||
slwi T1, K, 0
|
slwi T1, K, 0
|
||||||
|
|
||||||
ZGEMM_L1_COPYB:
|
ZGEMM_L1_COPYB:
|
||||||
|
dcbtst BBO, PRE
|
||||||
|
|
||||||
lxvdsx vs4, o0, BO // b0_r
|
lxvdsx vs4, o0, BO // b0_r
|
||||||
lxvdsx vs5, o8, BO // b0_i
|
lxvdsx vs5, o8, BO // b0_i
|
||||||
|
|
|
@ -1,3 +1,38 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
|
||||||
#define XSFADD_R1 xsadddp
|
#define XSFADD_R1 xsadddp
|
||||||
|
@ -3055,3 +3090,76 @@
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* ZCOPYB_1x1: expand one complex double at BO into 32 bytes at BBO, with each component duplicated across a full vector. */
/* Advances BO by 16 bytes and BBO by 32 bytes; clobbers vs4 and vs5. */
.macro ZCOPYB_1x1
|
||||||
|
|
||||||
|
lxvdsx vs4, o0, BO // b0_r
|
||||||
|
lxvdsx vs5, o8, BO // b0_i
|
||||||
|
addi BO, BO, 16
|
||||||
|
stxvd2x vs4, o0, BBO
|
||||||
|
stxvd2x vs5, o16, BBO
|
||||||
|
addi BBO, BBO, 32
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/* ZCOPYB_8x1: bulk form of the expansion - read eight 16-byte vectors (128 bytes) at BO, splat each doubleword */
/* into its own vector, and write the sixteen results (256 bytes) at BBO. */
/* Advances BO by 128 bytes and BBO by 256 bytes; clobbers vs32-vs55. */
/* NOTE(review): doubleword 0 / 1 of each input presumably match the b0_r / b0_i halves copied by ZCOPYB_1x1 - confirm. */
.macro ZCOPYB_8x1
|
||||||
|
|
||||||
|
lxvd2x vs32, o0, BO
|
||||||
|
lxvd2x vs33, o16, BO
|
||||||
|
lxvd2x vs34, o32, BO
|
||||||
|
lxvd2x vs35, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
lxvd2x vs36, o0, BO
|
||||||
|
lxvd2x vs37, o16, BO
|
||||||
|
lxvd2x vs38, o32, BO
|
||||||
|
lxvd2x vs39, o48, BO
|
||||||
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
xxspltd vs40, vs32, 0 /* each input vector yields two outputs: doubleword 0 splat, then doubleword 1 splat */
|
||||||
|
xxspltd vs41, vs32, 1
|
||||||
|
xxspltd vs42, vs33, 0
|
||||||
|
xxspltd vs43, vs33, 1
|
||||||
|
xxspltd vs44, vs34, 0
|
||||||
|
xxspltd vs45, vs34, 1
|
||||||
|
xxspltd vs46, vs35, 0
|
||||||
|
xxspltd vs47, vs35, 1
|
||||||
|
|
||||||
|
xxspltd vs48, vs36, 0
|
||||||
|
xxspltd vs49, vs36, 1
|
||||||
|
xxspltd vs50, vs37, 0
|
||||||
|
xxspltd vs51, vs37, 1
|
||||||
|
xxspltd vs52, vs38, 0
|
||||||
|
xxspltd vs53, vs38, 1
|
||||||
|
xxspltd vs54, vs39, 0
|
||||||
|
xxspltd vs55, vs39, 1
|
||||||
|
|
||||||
|
stxvd2x vs40, o0, BBO /* results are stored in register order, 64 bytes per group of four */
|
||||||
|
stxvd2x vs41, o16, BBO
|
||||||
|
stxvd2x vs42, o32, BBO
|
||||||
|
stxvd2x vs43, o48, BBO
|
||||||
|
addi BBO, BBO, 64
|
||||||
|
|
||||||
|
stxvd2x vs44, o0, BBO
|
||||||
|
stxvd2x vs45, o16, BBO
|
||||||
|
stxvd2x vs46, o32, BBO
|
||||||
|
stxvd2x vs47, o48, BBO
|
||||||
|
addi BBO, BBO, 64
|
||||||
|
|
||||||
|
stxvd2x vs48, o0, BBO
|
||||||
|
stxvd2x vs49, o16, BBO
|
||||||
|
stxvd2x vs50, o32, BBO
|
||||||
|
stxvd2x vs51, o48, BBO
|
||||||
|
addi BBO, BBO, 64
|
||||||
|
|
||||||
|
stxvd2x vs52, o0, BBO
|
||||||
|
stxvd2x vs53, o16, BBO
|
||||||
|
stxvd2x vs54, o32, BBO
|
||||||
|
stxvd2x vs55, o48, BBO
|
||||||
|
addi BBO, BBO, 64
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,205 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
/* Incoming arguments (first five integer argument registers). */
#define	M	r3
#define	N	r4
#define	A	r5
#define	LDA	r6
#define	B	r7

/* Per-row source cursors used by the COPY_* macros. */
#define	A0	r8
#define	A1	r9
#define	A2	r10
#define	A3	r11

/* Inner (column) loop counter. */
#define	J	r12

/* Prefetch offsets passed to dcbt / dcbtst. */
#define PREA	r14
#define PREB	r15

/* Destination cursors: current store target, and the running positions */
/* of the 8-, 4-, 2- and 1-wide output panels.                          */
#define BO	r16
#define B8	r17
#define B4	r18
#define B2	r19
#define B1	r20

#define NOTUS1	r21	/* presumably "not used" - no reference in this file */
#define T2	r22	/* scratch */
#define I	r23	/* outer (row-group) loop counter */

/* Constant byte offsets for indexed VSX loads/stores. */
#define o16	r24
#define o32	r25
#define o48	r26

#define NOTUS2	r27	/* presumably "not used" - no reference in this file */
#define M8	r30	/* M << (3 + ZBASE_SHIFT), added to BO per 8-wide panel */
#define T1	r31	/* scratch; store cursor inside the COPY_* macros */

/* Literal 0 in the RA slot of an indexed load/store means "no offset". */
#define o0	0
|
||||||
|
|
||||||
|
#include "zgemm_tcopy_macros_8_power8.S"
|
||||||
|
|
||||||
|
#define STACKSIZE 384
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Kernel body: sets up strides, the four destination panel cursors and the
 * prefetch/offset constants, runs the included copy logic, then returns 0.
 *
 * NOTE(review): only GPRs r14-r31 are saved below. Any included macro that
 * writes vs52-vs63 (aliases of v20-v31) would clobber callee-saved vector
 * registers - confirm against the macros file.
 * NOTE(review): cmpwi / slwi / mullw are word-sized; this assumes M, N, LDA
 * and the N*M products fit in 32 bits - confirm against callers.
 */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0,   0

	/* Save callee-saved GPRs. */
	std	r14,  280(SP)
	std	r15,  272(SP)
	std	r16,  264(SP)
	std	r17,  256(SP)
	std	r18,  248(SP)
	std	r19,  240(SP)
	std	r20,  232(SP)
	std	r21,  224(SP)
	std	r22,  216(SP)
	std	r23,  208(SP)
	std	r24,  200(SP)
	std	r25,  192(SP)
	std	r26,  184(SP)
	std	r27,  176(SP)
	std	r28,  168(SP)
	std	r29,  160(SP)
	std	r30,  152(SP)
	std	r31,  144(SP)

	/* Nothing to do for an empty matrix. */
	cmpwi	cr0, M, 0
	ble-	L999
	cmpwi	cr0, N, 0
	ble-	L999

	/* Convert element counts to byte strides. */
	slwi	LDA, LDA, ZBASE_SHIFT
	slwi	M8,  M,   3 + ZBASE_SHIFT

	/* B4 = B + ((N & -8) * M) << ZBASE_SHIFT : start of the 4-wide panel */
	li	T2,   -8
	and	B4,   N,  T2
	mullw	B4,   B4, M
	slwi	B4,   B4, ZBASE_SHIFT
	add	B4,   B4, B

	/* B2 = B + ((N & -4) * M) << ZBASE_SHIFT : start of the 2-wide panel */
	li	PREA, -4
	and	B2,   N,  PREA
	mullw	B2,   B2, M
	slwi	B2,   B2, ZBASE_SHIFT
	add	B2,   B2, B

	/* B1 = B + ((N & -2) * M) << ZBASE_SHIFT : start of the 1-wide panel */
	li	PREB, -2
	and	B1,   N,  PREB
	mullw	B1,   B1, M
	slwi	B1,   B1, ZBASE_SHIFT
	add	B1,   B1, B

	/* PREA / PREB were only mask temporaries above; load the real */
	/* prefetch distances now.                                      */
	li	PREA, 384
	addi	PREB, M8, 128

	/* Constant offsets for indexed VSX loads/stores. */
	li	o16,  16
	li	o32,  32
	li	o48,  48

#include "zgemm_tcopy_logic_8_power8.S"

L999:

	li	r3,   0

	/* Restore callee-saved GPRs. */
	ld	r14,  280(SP)
	ld	r15,  272(SP)
	ld	r16,  264(SP)
	ld	r17,  256(SP)
	ld	r18,  248(SP)
	ld	r19,  240(SP)
	ld	r20,  232(SP)
	ld	r21,  224(SP)
	ld	r22,  216(SP)
	ld	r23,  208(SP)
	ld	r24,  200(SP)
	ld	r25,  192(SP)
	ld	r26,  184(SP)
	ld	r27,  176(SP)
	ld	r28,  168(SP)
	ld	r29,  160(SP)
	ld	r30,  152(SP)
	ld	r31,  144(SP)

	addi	SP, SP, STACKSIZE

	blr
	EPILOGUE
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,246 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/* ---- Groups of four source rows: I = M >> 2 iterations ---- */
	srawi.	I,	M,	2
	ble	ZCOPYT_L2_BEGIN


ZCOPYT_L4_BEGIN:

	/* Claim this group's slot in the 8-wide panel, then step B past it. */
	mr	B8,	B
	addi	B,	B,	64*SIZE

	/* Four consecutive rows; A ends up at the row after them. */
	mr	A0,	A
	add	A1,	A0,	LDA
	add	A2,	A1,	LDA
	add	A3,	A2,	LDA
	add	A,	A3,	LDA

	/* J = N >> 3 full 8-column chunks. */
	sradi.	J,	N,	3
	ble	ZCOPYT_L4x4_BEGIN

	mr	BO,	B8

	.align 5

ZCOPYT_L4x8_LOOP:

	/* Prefetch the four source rows ... */
	dcbt	A0,	PREA
	dcbt	A1,	PREA
	dcbt	A2,	PREA
	dcbt	A3,	PREA

	/* ... and the destination at BO + M8, + PREB, + PREB+128, + PREB+256. */
	addi	T1,	PREB,	128
	addi	T2,	PREB,	256
	dcbtst	BO,	M8
	dcbtst	BO,	PREB
	dcbtst	BO,	T1
	dcbtst	BO,	T2

	COPY_4x8

	add	BO,	BO,	M8

	addic.	J,	J,	-1
	bgt	ZCOPYT_L4x8_LOOP

ZCOPYT_L4x4_BEGIN:

	/* andi. never yields a negative result, so "zero" is the only */
	/* not-greater case: beq here is equivalent to ble.            */
	andi.	T1,	N,	4
	beq	ZCOPYT_L4x2_BEGIN

	mr	BO,	B4
	COPY_4x4
	addi	B4,	B4,	32*SIZE

ZCOPYT_L4x2_BEGIN:

	andi.	T1,	N,	2
	beq	ZCOPYT_L4x1_BEGIN

	mr	BO,	B2
	COPY_4x2
	addi	B2,	B2,	16*SIZE

ZCOPYT_L4x1_BEGIN:

	andi.	T1,	N,	1
	beq	ZCOPYT_L4_END

	mr	BO,	B1
	COPY_4x1
	addi	B1,	B1,	8*SIZE

ZCOPYT_L4_END:

	addic.	I,	I,	-1
	bgt	ZCOPYT_L4_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ZCOPYT_L2_BEGIN:

	/* ---- One group of two source rows, taken when bit 1 of M is set ---- */
	/* andi. never yields a negative result, so beq is equivalent to ble.   */
	andi.	T1,	M,	2
	beq	ZCOPYT_L1_BEGIN

	/* Claim this group's slot in the 8-wide panel, then step B past it. */
	mr	B8,	B
	addi	B,	B,	32*SIZE

	/* Two consecutive rows; A ends up at the row after them. */
	mr	A0,	A
	add	A1,	A0,	LDA
	add	A,	A1,	LDA

	/* J = N >> 3 full 8-column chunks. */
	sradi.	J,	N,	3
	ble	ZCOPYT_L2x4_BEGIN

	mr	BO,	B8

ZCOPYT_L2x8_LOOP:

	COPY_2x8

	add	BO,	BO,	M8

	addic.	J,	J,	-1
	bgt	ZCOPYT_L2x8_LOOP

ZCOPYT_L2x4_BEGIN:

	andi.	T1,	N,	4
	beq	ZCOPYT_L2x2_BEGIN

	mr	BO,	B4
	COPY_2x4
	addi	B4,	B4,	16*SIZE

ZCOPYT_L2x2_BEGIN:

	andi.	T1,	N,	2
	beq	ZCOPYT_L2x1_BEGIN

	mr	BO,	B2
	COPY_2x2
	addi	B2,	B2,	8*SIZE

ZCOPYT_L2x1_BEGIN:

	andi.	T1,	N,	1
	beq	ZCOPYT_L2_END

	mr	BO,	B1
	COPY_2x1
	addi	B1,	B1,	4*SIZE

ZCOPYT_L2_END:
|
||||||
|
|
||||||
|
|
||||||
|
ZCOPYT_L1_BEGIN:

	/* ---- Final single source row, taken when bit 0 of M is set ---- */
	/* andi. never yields a negative result, so beq is equivalent to ble. */
	andi.	T1,	M,	1
	beq	L999

	/* Claim this row's slot in the 8-wide panel, then step B past it. */
	mr	B8,	B
	addi	B,	B,	16*SIZE

	mr	A0,	A
	add	A,	A0,	LDA

	/* J = N >> 3 full 8-column chunks. */
	sradi.	J,	N,	3
	ble	ZCOPYT_L1x4_BEGIN

	mr	BO,	B8

ZCOPYT_L1x8_LOOP:

	COPY_1x8

	add	BO,	BO,	M8

	addic.	J,	J,	-1
	bgt	ZCOPYT_L1x8_LOOP

ZCOPYT_L1x4_BEGIN:

	andi.	T1,	N,	4
	beq	ZCOPYT_L1x2_BEGIN

	mr	BO,	B4
	COPY_1x4
	addi	B4,	B4,	8*SIZE

ZCOPYT_L1x2_BEGIN:

	andi.	T1,	N,	2
	beq	ZCOPYT_L1x1_BEGIN

	mr	BO,	B2
	COPY_1x2
	addi	B2,	B2,	4*SIZE

ZCOPYT_L1x1_BEGIN:

	andi.	T1,	N,	1
	beq	ZCOPYT_L1_END

	mr	BO,	B1
	COPY_1x1
	addi	B1,	B1,	2*SIZE

ZCOPYT_L1_END:
|
||||||
|
|
|
@ -0,0 +1,535 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x8

	/*
	 * Copy 8 consecutive 16-byte elements from each of the four rows
	 * A0..A3 into 512 contiguous bytes at BO (row A0 first, A3 last).
	 * A0..A3 each advance by 128 bytes; BO is left unchanged and T1 is
	 * used as the store cursor.
	 *
	 * Register choice: vs32-vs51 plus vs0-vs11. vs52-vs63 alias v20-v31,
	 * which are callee-saved in the PowerPC64 ELF ABI and are not saved by
	 * the kernel prologue, so they must not be used as scratch here.
	 */

	/* Row A0 -> vs32..vs39 */
	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0
	lxvd2x	vs34,	o32,	A0
	lxvd2x	vs35,	o48,	A0
	addi	A0,	A0,	64

	lxvd2x	vs36,	o0,	A0
	lxvd2x	vs37,	o16,	A0
	lxvd2x	vs38,	o32,	A0
	lxvd2x	vs39,	o48,	A0
	addi	A0,	A0,	64

	/* Row A1 -> vs40..vs47 */
	lxvd2x	vs40,	o0,	A1
	lxvd2x	vs41,	o16,	A1
	lxvd2x	vs42,	o32,	A1
	lxvd2x	vs43,	o48,	A1
	addi	A1,	A1,	64

	lxvd2x	vs44,	o0,	A1
	lxvd2x	vs45,	o16,	A1
	lxvd2x	vs46,	o32,	A1
	lxvd2x	vs47,	o48,	A1
	addi	A1,	A1,	64

	/* Row A2 -> vs48..vs51, vs0..vs3 */
	lxvd2x	vs48,	o0,	A2
	lxvd2x	vs49,	o16,	A2
	lxvd2x	vs50,	o32,	A2
	lxvd2x	vs51,	o48,	A2
	addi	A2,	A2,	64

	lxvd2x	vs0,	o0,	A2
	lxvd2x	vs1,	o16,	A2
	lxvd2x	vs2,	o32,	A2
	lxvd2x	vs3,	o48,	A2
	addi	A2,	A2,	64

	/* Row A3 -> vs4..vs11 */
	lxvd2x	vs4,	o0,	A3
	lxvd2x	vs5,	o16,	A3
	lxvd2x	vs6,	o32,	A3
	lxvd2x	vs7,	o48,	A3
	addi	A3,	A3,	64

	lxvd2x	vs8,	o0,	A3
	lxvd2x	vs9,	o16,	A3
	lxvd2x	vs10,	o32,	A3
	lxvd2x	vs11,	o48,	A3
	addi	A3,	A3,	64

	mr	T1,	BO

	/* Row A0 */
	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs36,	o0,	T1
	stxvd2x	vs37,	o16,	T1
	stxvd2x	vs38,	o32,	T1
	stxvd2x	vs39,	o48,	T1
	addi	T1,	T1,	64

	/* Row A1 */
	stxvd2x	vs40,	o0,	T1
	stxvd2x	vs41,	o16,	T1
	stxvd2x	vs42,	o32,	T1
	stxvd2x	vs43,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs44,	o0,	T1
	stxvd2x	vs45,	o16,	T1
	stxvd2x	vs46,	o32,	T1
	stxvd2x	vs47,	o48,	T1
	addi	T1,	T1,	64

	/* Row A2 */
	stxvd2x	vs48,	o0,	T1
	stxvd2x	vs49,	o16,	T1
	stxvd2x	vs50,	o32,	T1
	stxvd2x	vs51,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs0,	o0,	T1
	stxvd2x	vs1,	o16,	T1
	stxvd2x	vs2,	o32,	T1
	stxvd2x	vs3,	o48,	T1
	addi	T1,	T1,	64

	/* Row A3 */
	stxvd2x	vs4,	o0,	T1
	stxvd2x	vs5,	o16,	T1
	stxvd2x	vs6,	o32,	T1
	stxvd2x	vs7,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs8,	o0,	T1
	stxvd2x	vs9,	o16,	T1
	stxvd2x	vs10,	o32,	T1
	stxvd2x	vs11,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x4

	/*
	 * Copy 4 consecutive 16-byte elements from each of the rows A0..A3
	 * into 256 contiguous bytes at BO. A0..A3 each advance by 64 bytes;
	 * T1 is the store cursor, BO itself is not modified.
	 */
	mr	T1,	BO

	/* Row A0 -> vs32..vs35 */
	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0
	lxvd2x	vs34,	o32,	A0
	lxvd2x	vs35,	o48,	A0

	/* Row A1 -> vs36..vs39 */
	lxvd2x	vs36,	o0,	A1
	lxvd2x	vs37,	o16,	A1
	lxvd2x	vs38,	o32,	A1
	lxvd2x	vs39,	o48,	A1

	/* Row A2 -> vs40..vs43 */
	lxvd2x	vs40,	o0,	A2
	lxvd2x	vs41,	o16,	A2
	lxvd2x	vs42,	o32,	A2
	lxvd2x	vs43,	o48,	A2

	/* Row A3 -> vs44..vs47 */
	lxvd2x	vs44,	o0,	A3
	lxvd2x	vs45,	o16,	A3
	lxvd2x	vs46,	o32,	A3
	lxvd2x	vs47,	o48,	A3

	/* Step every source cursor past the four elements just read. */
	addi	A0,	A0,	64
	addi	A1,	A1,	64
	addi	A2,	A2,	64
	addi	A3,	A3,	64

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs36,	o0,	T1
	stxvd2x	vs37,	o16,	T1
	stxvd2x	vs38,	o32,	T1
	stxvd2x	vs39,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs40,	o0,	T1
	stxvd2x	vs41,	o16,	T1
	stxvd2x	vs42,	o32,	T1
	stxvd2x	vs43,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs44,	o0,	T1
	stxvd2x	vs45,	o16,	T1
	stxvd2x	vs46,	o32,	T1
	stxvd2x	vs47,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x2

	/*
	 * Copy 2 consecutive 16-byte elements from each of the rows A0..A3
	 * into 128 contiguous bytes at BO. A0..A3 each advance by 32 bytes;
	 * T1 is the store cursor, BO itself is not modified.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0

	lxvd2x	vs34,	o0,	A1
	lxvd2x	vs35,	o16,	A1

	lxvd2x	vs36,	o0,	A2
	lxvd2x	vs37,	o16,	A2

	lxvd2x	vs38,	o0,	A3
	lxvd2x	vs39,	o16,	A3

	/* Step every source cursor past the two elements just read. */
	addi	A0,	A0,	32
	addi	A1,	A1,	32
	addi	A2,	A2,	32
	addi	A3,	A3,	32

	/* Rows A0 and A1 */
	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1
	addi	T1,	T1,	64

	/* Rows A2 and A3 */
	stxvd2x	vs36,	o0,	T1
	stxvd2x	vs37,	o16,	T1
	stxvd2x	vs38,	o32,	T1
	stxvd2x	vs39,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=4 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_4x1

	/*
	 * Copy one 16-byte element from each of the rows A0..A3 into 64
	 * contiguous bytes at BO. A0..A3 each advance by 16 bytes; T1 is set
	 * to BO and used as the store base.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o0,	A1
	lxvd2x	vs34,	o0,	A2
	lxvd2x	vs35,	o0,	A3

	addi	A0,	A0,	16
	addi	A1,	A1,	16
	addi	A2,	A2,	16
	addi	A3,	A3,	16

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x8

	/*
	 * Copy 8 consecutive 16-byte elements from each of the rows A0 and A1
	 * into 256 contiguous bytes at BO. A0 and A1 each advance by 128
	 * bytes; T1 is the store cursor, BO itself is not modified.
	 */
	mr	T1,	BO

	/* Row A0 -> vs32..vs39 */
	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0
	lxvd2x	vs34,	o32,	A0
	lxvd2x	vs35,	o48,	A0
	addi	A0,	A0,	64

	lxvd2x	vs36,	o0,	A0
	lxvd2x	vs37,	o16,	A0
	lxvd2x	vs38,	o32,	A0
	lxvd2x	vs39,	o48,	A0
	addi	A0,	A0,	64

	/* Row A1 -> vs40..vs47 */
	lxvd2x	vs40,	o0,	A1
	lxvd2x	vs41,	o16,	A1
	lxvd2x	vs42,	o32,	A1
	lxvd2x	vs43,	o48,	A1
	addi	A1,	A1,	64

	lxvd2x	vs44,	o0,	A1
	lxvd2x	vs45,	o16,	A1
	lxvd2x	vs46,	o32,	A1
	lxvd2x	vs47,	o48,	A1
	addi	A1,	A1,	64

	/* Row A0 */
	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs36,	o0,	T1
	stxvd2x	vs37,	o16,	T1
	stxvd2x	vs38,	o32,	T1
	stxvd2x	vs39,	o48,	T1
	addi	T1,	T1,	64

	/* Row A1 */
	stxvd2x	vs40,	o0,	T1
	stxvd2x	vs41,	o16,	T1
	stxvd2x	vs42,	o32,	T1
	stxvd2x	vs43,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs44,	o0,	T1
	stxvd2x	vs45,	o16,	T1
	stxvd2x	vs46,	o32,	T1
	stxvd2x	vs47,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x4

	/*
	 * Copy 4 consecutive 16-byte elements from each of the rows A0 and A1
	 * into 128 contiguous bytes at BO. A0 and A1 each advance by 64
	 * bytes; T1 is the store cursor, BO itself is not modified.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0
	lxvd2x	vs34,	o32,	A0
	lxvd2x	vs35,	o48,	A0

	lxvd2x	vs36,	o0,	A1
	lxvd2x	vs37,	o16,	A1
	lxvd2x	vs38,	o32,	A1
	lxvd2x	vs39,	o48,	A1

	addi	A0,	A0,	64
	addi	A1,	A1,	64

	/* Row A0 */
	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1
	addi	T1,	T1,	64

	/* Row A1 */
	stxvd2x	vs36,	o0,	T1
	stxvd2x	vs37,	o16,	T1
	stxvd2x	vs38,	o32,	T1
	stxvd2x	vs39,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x2

	/*
	 * Copy 2 consecutive 16-byte elements from each of the rows A0 and A1
	 * into 64 contiguous bytes at BO. A0 and A1 each advance by 32 bytes;
	 * T1 is set to BO and used as the store base.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0

	lxvd2x	vs34,	o0,	A1
	lxvd2x	vs35,	o16,	A1

	addi	A0,	A0,	32
	addi	A1,	A1,	32

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=2 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_2x1

	/*
	 * Copy one 16-byte element from each of the rows A0 and A1 into 32
	 * contiguous bytes at BO. A0 and A1 each advance by 16 bytes; T1 is
	 * set to BO and used as the store base.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o0,	A1

	addi	A0,	A0,	16
	addi	A1,	A1,	16

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=8
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x8

	/*
	 * Copy 8 consecutive 16-byte elements from row A0 into 128 contiguous
	 * bytes at BO. A0 advances by 128 bytes; T1 is the store cursor, BO
	 * itself is not modified.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0
	lxvd2x	vs34,	o32,	A0
	lxvd2x	vs35,	o48,	A0
	addi	A0,	A0,	64

	lxvd2x	vs36,	o0,	A0
	lxvd2x	vs37,	o16,	A0
	lxvd2x	vs38,	o32,	A0
	lxvd2x	vs39,	o48,	A0
	addi	A0,	A0,	64

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1
	addi	T1,	T1,	64

	stxvd2x	vs36,	o0,	T1
	stxvd2x	vs37,	o16,	T1
	stxvd2x	vs38,	o32,	T1
	stxvd2x	vs39,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=4
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x4

	/*
	 * Copy 4 consecutive 16-byte elements from row A0 into 64 contiguous
	 * bytes at BO. A0 advances by 64 bytes; T1 is set to BO and used as
	 * the store base.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0
	lxvd2x	vs34,	o32,	A0
	lxvd2x	vs35,	o48,	A0

	addi	A0,	A0,	64

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1
	stxvd2x	vs34,	o32,	T1
	stxvd2x	vs35,	o48,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=2
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x2

	/*
	 * Copy 2 consecutive 16-byte elements from row A0 into 32 contiguous
	 * bytes at BO. A0 advances by 32 bytes; T1 is set to BO and used as
	 * the store base.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	lxvd2x	vs33,	o16,	A0

	addi	A0,	A0,	32

	stxvd2x	vs32,	o0,	T1
	stxvd2x	vs33,	o16,	T1

.endm
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************************************
|
||||||
|
* Macros for N=1 and M=1
|
||||||
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
.macro COPY_1x1

	/*
	 * Copy one 16-byte element from row A0 to BO. A0 advances by 16
	 * bytes; T1 is set to BO and used as the store base.
	 */
	mr	T1,	BO

	lxvd2x	vs32,	o0,	A0
	addi	A0,	A0,	16

	stxvd2x	vs32,	o0,	T1

.endm
|
||||||
|
|
|
@ -933,6 +933,23 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef EXCAVATOR
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "Excavator\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef PILEDRIVER
|
#ifdef PILEDRIVER
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
DSCALKERNEL = dscal.c
|
||||||
|
CSCALKERNEL = cscal.c
|
||||||
|
ZSCALKERNEL = zscal.c
|
||||||
|
|
||||||
SAXPYKERNEL = saxpy.c
|
SAXPYKERNEL = saxpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
CAXPYKERNEL = caxpy.c
|
CAXPYKERNEL = caxpy.c
|
||||||
|
@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
|
||||||
DGEMVNKERNEL = dgemv_n_4.c
|
DGEMVNKERNEL = dgemv_n_4.c
|
||||||
DGEMVTKERNEL = dgemv_t_4.c
|
DGEMVTKERNEL = dgemv_t_4.c
|
||||||
|
|
||||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
ZGEMVNKERNEL = zgemv_n_4.c
|
||||||
ZGEMVTKERNEL = zgemv_t_4.c
|
ZGEMVTKERNEL = zgemv_t_4.c
|
||||||
|
|
||||||
DCOPYKERNEL = dcopy_bulldozer.S
|
DCOPYKERNEL = dcopy_bulldozer.S
|
||||||
|
@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c
|
||||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
|
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
|
||||||
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
|
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
|
||||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c
|
||||||
|
|
||||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c
|
||||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c
|
||||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c
|
||||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c
|
||||||
|
|
||||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c
|
||||||
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c
|
||||||
|
ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c
|
||||||
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "caxpy_microk_steamroller-2.c"
|
#include "caxpy_microk_steamroller-2.c"
|
||||||
#elif defined(BULLDOZER)
|
#elif defined(BULLDOZER)
|
||||||
#include "caxpy_microk_bulldozer-2.c"
|
#include "caxpy_microk_bulldozer-2.c"
|
||||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER)
|
||||||
#include "cdot_microk_bulldozer-2.c"
|
#include "cdot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
||||||
#include "cdot_microk_steamroller-2.c"
|
#include "cdot_microk_steamroller-2.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL)
|
||||||
#include "cdot_microk_haswell-2.c"
|
#include "cdot_microk_haswell-2.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(HASWELL)
|
#if defined(HASWELL)
|
||||||
#include "cgemv_n_microk_haswell-4.c"
|
#include "cgemv_n_microk_haswell-4.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "cgemv_n_microk_bulldozer-4.c"
|
#include "cgemv_n_microk_bulldozer-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(HASWELL)
|
#if defined(HASWELL)
|
||||||
#include "cgemv_t_microk_haswell-4.c"
|
#include "cgemv_t_microk_haswell-4.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "cgemv_t_microk_bulldozer-4.c"
|
#include "cgemv_t_microk_bulldozer-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "cscal_microk_haswell-2.c"
|
#include "cscal_microk_haswell-2.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
||||||
#include "cscal_microk_bulldozer-2.c"
|
#include "cscal_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER)
|
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "cscal_microk_steamroller-2.c"
|
#include "cscal_microk_steamroller-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "cscal_microk_bulldozer-2.c"
|
#include "cscal_microk_bulldozer-2.c"
|
||||||
|
|
|
@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "daxpy_microk_nehalem-2.c"
|
#include "daxpy_microk_nehalem-2.c"
|
||||||
#elif defined(BULLDOZER)
|
#elif defined(BULLDOZER)
|
||||||
#include "daxpy_microk_bulldozer-2.c"
|
#include "daxpy_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER)
|
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "daxpy_microk_steamroller-2.c"
|
#include "daxpy_microk_steamroller-2.c"
|
||||||
#elif defined(PILEDRIVER)
|
#elif defined(PILEDRIVER)
|
||||||
#include "daxpy_microk_piledriver-2.c"
|
#include "daxpy_microk_piledriver-2.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER)
|
||||||
#include "ddot_microk_bulldozer-2.c"
|
#include "ddot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER)
|
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "ddot_microk_steamroller-2.c"
|
#include "ddot_microk_steamroller-2.c"
|
||||||
#elif defined(PILEDRIVER)
|
#elif defined(PILEDRIVER)
|
||||||
#include "ddot_microk_piledriver-2.c"
|
#include "ddot_microk_piledriver-2.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "dgemv_n_microk_nehalem-4.c"
|
#include "dgemv_n_microk_nehalem-4.c"
|
||||||
#elif defined(HASWELL) || defined(STEAMROLLER)
|
#elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dgemv_n_microk_haswell-4.c"
|
#include "dgemv_n_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(HASWELL) || defined(STEAMROLLER)
|
#if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dgemv_t_microk_haswell-4.c"
|
#include "dgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dscal_microk_bulldozer-2.c"
|
#include "dscal_microk_bulldozer-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "dscal_microk_sandy-2.c"
|
#include "dscal_microk_sandy-2.c"
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dsymv_L_microk_bulldozer-2.c"
|
#include "dsymv_L_microk_bulldozer-2.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL)
|
||||||
#include "dsymv_L_microk_haswell-2.c"
|
#include "dsymv_L_microk_haswell-2.c"
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "dsymv_U_microk_bulldozer-2.c"
|
#include "dsymv_U_microk_bulldozer-2.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL)
|
||||||
#include "dsymv_U_microk_haswell-2.c"
|
#include "dsymv_U_microk_haswell-2.c"
|
||||||
|
|
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "saxpy_microk_haswell-2.c"
|
#include "saxpy_microk_haswell-2.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "saxpy_microk_sandy-2.c"
|
#include "saxpy_microk_sandy-2.c"
|
||||||
#elif defined(PILEDRIVER) || defined(STEAMROLLER)
|
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "saxpy_microk_piledriver-2.c"
|
#include "saxpy_microk_piledriver-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER)
|
||||||
#include "sdot_microk_bulldozer-2.c"
|
#include "sdot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
||||||
#include "sdot_microk_steamroller-2.c"
|
#include "sdot_microk_steamroller-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "sdot_microk_nehalem-2.c"
|
#include "sdot_microk_nehalem-2.c"
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "sgemv_n_microk_bulldozer-4.c"
|
#include "sgemv_n_microk_bulldozer-4.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "sgemv_n_microk_nehalem-4.c"
|
#include "sgemv_n_microk_nehalem-4.c"
|
||||||
|
@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sgemv_n_microk_haswell-4.c"
|
#include "sgemv_n_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(STEAMROLLER)
|
#if defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
#else
|
#else
|
||||||
#define NBMAX 4096
|
#define NBMAX 4096
|
||||||
|
|
|
@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "sgemv_t_microk_nehalem-4.c"
|
#include "sgemv_t_microk_nehalem-4.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "sgemv_t_microk_bulldozer-4.c"
|
#include "sgemv_t_microk_bulldozer-4.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "sgemv_t_microk_sandy-4.c"
|
#include "sgemv_t_microk_sandy-4.c"
|
||||||
|
@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sgemv_t_microk_haswell-4.c"
|
#include "sgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(STEAMROLLER)
|
#if defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
#else
|
#else
|
||||||
#define NBMAX 4096
|
#define NBMAX 4096
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "ssymv_L_microk_bulldozer-2.c"
|
#include "ssymv_L_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ssymv_L_microk_nehalem-2.c"
|
#include "ssymv_L_microk_nehalem-2.c"
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "ssymv_U_microk_bulldozer-2.c"
|
#include "ssymv_U_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ssymv_U_microk_nehalem-2.c"
|
#include "ssymv_U_microk_nehalem-2.c"
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER)
|
||||||
#include "zaxpy_microk_bulldozer-2.c"
|
#include "zaxpy_microk_bulldozer-2.c"
|
||||||
#elif defined(PILEDRIVER) || defined(STEAMROLLER)
|
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "zaxpy_microk_steamroller-2.c"
|
#include "zaxpy_microk_steamroller-2.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL)
|
||||||
#include "zaxpy_microk_haswell-2.c"
|
#include "zaxpy_microk_haswell-2.c"
|
||||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER)
|
||||||
#include "zdot_microk_bulldozer-2.c"
|
#include "zdot_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
|
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
|
||||||
#include "zdot_microk_steamroller-2.c"
|
#include "zdot_microk_steamroller-2.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL)
|
||||||
#include "zdot_microk_haswell-2.c"
|
#include "zdot_microk_haswell-2.c"
|
||||||
|
|
|
@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "zgemv_n_microk_haswell-4.c"
|
#include "zgemv_n_microk_haswell-4.c"
|
||||||
#elif defined(SANDYBRIDGE)
|
#elif defined(SANDYBRIDGE)
|
||||||
#include "zgemv_n_microk_sandy-4.c"
|
#include "zgemv_n_microk_sandy-4.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "zgemv_n_microk_bulldozer-4.c"
|
#include "zgemv_n_microk_bulldozer-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "zgemv_t_microk_bulldozer-4.c"
|
#include "zgemv_t_microk_bulldozer-4.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL)
|
||||||
#include "zgemv_t_microk_haswell-4.c"
|
#include "zgemv_t_microk_haswell-4.c"
|
||||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "zscal_microk_haswell-2.c"
|
#include "zscal_microk_haswell-2.c"
|
||||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
||||||
#include "zscal_microk_bulldozer-2.c"
|
#include "zscal_microk_bulldozer-2.c"
|
||||||
#elif defined(STEAMROLLER)
|
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
#include "zscal_microk_steamroller-2.c"
|
#include "zscal_microk_steamroller-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
16
param.h
16
param.h
|
@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 960
|
#define SGEMM_DEFAULT_P 1280
|
||||||
#define DGEMM_DEFAULT_P 480
|
#define DGEMM_DEFAULT_P 640
|
||||||
#define CGEMM_DEFAULT_P 720
|
#define CGEMM_DEFAULT_P 640
|
||||||
#define ZGEMM_DEFAULT_P 480
|
#define ZGEMM_DEFAULT_P 320
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 720
|
#define SGEMM_DEFAULT_Q 640
|
||||||
#define DGEMM_DEFAULT_Q 720
|
#define DGEMM_DEFAULT_Q 640
|
||||||
#define CGEMM_DEFAULT_Q 720
|
#define CGEMM_DEFAULT_Q 640
|
||||||
#define ZGEMM_DEFAULT_Q 720
|
#define ZGEMM_DEFAULT_Q 640
|
||||||
|
|
||||||
#define SYMV_P 8
|
#define SYMV_P 8
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ include ../Makefile.system
|
||||||
all :: level1 level2 level3
|
all :: level1 level2 level3
|
||||||
|
|
||||||
level1 : sblat1 dblat1 cblat1 zblat1
|
level1 : sblat1 dblat1 cblat1 zblat1
|
||||||
|
ifndef CROSS
|
||||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
|
||||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
|
||||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
|
||||||
|
@ -21,8 +22,10 @@ else
|
||||||
OPENBLAS_NUM_THREADS=2 ./zblat1
|
OPENBLAS_NUM_THREADS=2 ./zblat1
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
level2 : sblat2 dblat2 cblat2 zblat2
|
level2 : sblat2 dblat2 cblat2 zblat2
|
||||||
|
ifndef CROSS
|
||||||
rm -f ?BLAT2.SUMM
|
rm -f ?BLAT2.SUMM
|
||||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
|
||||||
@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
|
@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
|
||||||
|
@ -54,8 +57,10 @@ else
|
||||||
@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
|
@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
level3 : sblat3 dblat3 cblat3 zblat3
|
level3 : sblat3 dblat3 cblat3 zblat3
|
||||||
|
ifndef CROSS
|
||||||
rm -f ?BLAT3.SUMM
|
rm -f ?BLAT3.SUMM
|
||||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
|
||||||
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
|
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
|
||||||
|
@ -87,9 +92,11 @@ else
|
||||||
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
|
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
level3_3m : zblat3_3m cblat3_3m
|
level3_3m : zblat3_3m cblat3_3m
|
||||||
|
ifndef CROSS
|
||||||
rm -f ?BLAT3_3M.SUMM
|
rm -f ?BLAT3_3M.SUMM
|
||||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
|
||||||
@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
|
@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
|
||||||
|
@ -109,6 +116,7 @@ else
|
||||||
@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
|
@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
|
||||||
$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
|
$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
run_test: $(UTESTBIN)
|
run_test: $(UTESTBIN)
|
||||||
|
ifndef CROSS
|
||||||
./$(UTESTBIN)
|
./$(UTESTBIN)
|
||||||
|
endif
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm -f *.o $(UTESTBIN)
|
-rm -f *.o $(UTESTBIN)
|
||||||
|
|
Loading…
Reference in New Issue