conflict resolved by syncing with 'xianyi:develop'

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
This commit is contained in:
Shivraj Patil 2016-05-04 11:07:14 +05:30
commit 085cf236c2
61 changed files with 8435 additions and 789 deletions

View File

@ -151,5 +151,9 @@ In chronological order:
* [2016-03-20] Fix compiler error in VisualStudio with CMake * [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking * [2016-03-22] Fix access violation on Windows while static linking
* Paul Mustière <https://github.com/buffer51/>
* [2016-02-04] Fix Android build on ARMV7
* [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
* Shivraj Patil <https://github.com/sva-img/> * Shivraj Patil <https://github.com/sva-img/>
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA

View File

@ -108,8 +108,6 @@ endif
tests : tests :
ifndef NOFORTRAN ifndef NOFORTRAN
ifndef TARGET
ifndef CROSS
touch $(LIBNAME) touch $(LIBNAME)
ifndef NO_FBLAS ifndef NO_FBLAS
$(MAKE) -C test all $(MAKE) -C test all
@ -119,8 +117,6 @@ ifndef NO_CBLAS
$(MAKE) -C ctest all $(MAKE) -C ctest all
endif endif
endif endif
endif
endif
libs : libs :
ifeq ($(CORE), UNKOWN) ifeq ($(CORE), UNKOWN)

View File

@ -20,75 +20,75 @@ lib.grd :
$(error OpenBLAS: Please run "make" firstly) $(error OpenBLAS: Please run "make" firstly)
install : lib.grd install : lib.grd
@-mkdir -p $(DESTDIR)$(PREFIX) @-mkdir -p "$(DESTDIR)$(PREFIX)"
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc #for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
ifndef NO_CBLAS ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif endif
ifndef NO_LAPACKE ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif endif
#for install static library #for install static library
ifndef NO_STATIC ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif endif
#for install shared library #for install shared library
ifndef NO_SHARED ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), NetBSD) ifeq ($(OSNAME), NetBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@ -96,34 +96,34 @@ endif
endif endif
#Generating OpenBLASConfig.cmake #Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED ifndef NO_SHARED
#ifeq logical or #ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
else else
#only static #only static
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
#Generating OpenBLASConfigVersion.cmake #Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK! @echo Install OK!

View File

@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS. - **FreeBSD**: Supported by community. We didn't test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages ## Usages
Link with libopenblas.a or -lopenblas for shared library. Link with libopenblas.a or -lopenblas for shared library.

18
c_check
View File

@ -1,5 +1,7 @@
#!/usr/bin/perl #!/usr/bin/perl
use File::Basename;
# Checking cross compile # Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
@ -26,14 +28,12 @@ if ($?) {
$cross_suffix = ""; $cross_suffix = "";
if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { if (dirname($compiler_name) ne ".") {
if ($1 =~ /(.*-)(.*)/) { $cross_suffix .= dirname($compiler_name) . "/";
$cross_suffix = $1; }
}
} else { if (basename($compiler_name) =~ /(.*-)(.*)/) {
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { $cross_suffix .= $1;
$cross_suffix = $1;
}
} }
$compiler = ""; $compiler = "";
@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";

View File

@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3 all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1 all1: xscblat1 xdcblat1 xccblat1 xzcblat1
ifndef CROSS
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat1 OMP_NUM_THREADS=2 ./xscblat1
OMP_NUM_THREADS=2 ./xdcblat1 OMP_NUM_THREADS=2 ./xdcblat1
@ -53,8 +54,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat1 OPENBLAS_NUM_THREADS=2 ./xccblat1
OPENBLAS_NUM_THREADS=2 ./xzcblat1 OPENBLAS_NUM_THREADS=2 ./xzcblat1
endif endif
endif
all2: xscblat2 xdcblat2 xccblat2 xzcblat2 all2: xscblat2 xdcblat2 xccblat2 xzcblat2
ifndef CROSS
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat2 < sin2 OMP_NUM_THREADS=2 ./xscblat2 < sin2
OMP_NUM_THREADS=2 ./xdcblat2 < din2 OMP_NUM_THREADS=2 ./xdcblat2 < din2
@ -66,8 +69,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
endif endif
endif
all3: xscblat3 xdcblat3 xccblat3 xzcblat3 all3: xscblat3 xdcblat3 xccblat3 xzcblat3
ifndef CROSS
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat3 < sin3 OMP_NUM_THREADS=2 ./xscblat3 < sin3
OMP_NUM_THREADS=2 ./xdcblat3 < din3 OMP_NUM_THREADS=2 ./xdcblat3 < din3
@ -88,6 +93,7 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif endif
endif

View File

@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128]; char message[128];
//char mname[20]; //char mname[20];
for ( i=1 ; i <= 21; i++) for ( i=1 ; i <= 22; i++)
{ {
if (!strncasecmp(coretype,corename[i],20)) if (!strncasecmp(coretype,corename[i],20))
{ {

View File

@ -361,6 +361,9 @@ static void numa_mapping(void) {
unsigned long work, bit; unsigned long work, bit;
int count = 0; int count = 0;
int bitmask_idx = 0; int bitmask_idx = 0;
int current_cpu;
int current_node = 0;
int cpu_count = 0;
for (node = 0; node < common -> num_nodes; node ++) { for (node = 0; node < common -> num_nodes; node ++) {
core = 0; core = 0;
@ -382,6 +385,56 @@ static void numa_mapping(void) {
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif #endif
current_cpu = sched_getcpu();
for (cpu = 0; cpu < count; cpu++) {
if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
current_node = READ_NODE(common -> cpu_info[cpu]);
break;
}
}
for (i = 0; i < MAX_BITMASK_LEN; i++)
cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
/*
* If all the processes can be accommodated in the
* current node itself, then bind to cores
* from the current node only
*/
if (numprocs <= cpu_count) {
/*
* First sort all the cores in order from the current node.
* Then take remaining nodes one by one in order,
* and sort their cores in order.
*/
for (i = 0; i < count; i++) {
for (j = 0; j < count - 1; j++) {
int node_1, node_2;
int core_1, core_2;
int swap = 0;
node_1 = READ_NODE(common -> cpu_info[j]);
node_2 = READ_NODE(common -> cpu_info[j + 1]);
core_1 = READ_CORE(common -> cpu_info[j]);
core_2 = READ_CORE(common -> cpu_info[j + 1]);
if (node_1 == node_2) {
if (core_1 > core_2)
swap = 1;
} else {
if ((node_2 == current_node) ||
((node_1 != current_node) && (node_1 > node_2)))
swap = 1;
}
if (swap) {
unsigned long temp;
temp = common->cpu_info[j];
common->cpu_info[j] = common->cpu_info[j + 1];
common->cpu_info[j + 1] = temp;
}
}
}
} else {
h = 1; h = 1;
while (h < count) h = 2 * h + 1; while (h < count) h = 2 * h + 1;
@ -411,12 +464,16 @@ static void numa_mapping(void) {
} }
} }
}
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "\nSorting ...\n\n"); fprintf(stderr, "\nSorting ...\n\n");
for (cpu = 0; cpu < count; cpu++) for (cpu = 0; cpu < count; cpu++)
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
READ_CPU(common -> cpu_info[cpu]),
READ_CORE(common -> cpu_info[cpu]),
READ_NODE(common -> cpu_info[cpu]));
#endif #endif
} }

View File

@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){ void blas_set_parameter(void){
int factor; int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
int size = 16; int size = 16;
#else #else
int size = get_L2_size(); int size = get_L2_size();

View File

@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran #only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif endif
dllinit.$(SUFFIX) : dllinit.c dllinit.$(SUFFIX) : dllinit.c

View File

@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S
SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o
@ -21,8 +21,8 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = gemm_ncopy_4.S DGEMMONCOPY = dgemm_ncopy_4_power8.S
DGEMMOTCOPY = gemm_tcopy_4.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o
@ -30,7 +30,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o
@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o

View File

@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Symbolic names for the general-purpose registers used by this routine and
   by the macro/logic files it includes. */
/* NOTE(review): r3..r7 presumably hold the incoming arguments (m, n, a, lda, b)
   in that order - confirm against the C prototype of the tcopy kernel. */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7
/* A0..A3 and J are not referenced in this file; presumably they are the
   working pointers / loop counter of the included macro and logic files. */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11
#define J r12
/* PREA and PREB are first used as scratch for the -4 / -2 masks in the setup
   code, then reloaded with 384 and M8 + 128. NOTE(review): the names suggest
   prefetch offsets for A and B - confirm in the included logic file. */
#define PREA r14
#define PREB r15
/* BO and B8 are not referenced in this file. B4, B2 and B1 are computed in the
   setup code as B plus a scaled (N rounded down to a multiple of 8/4/2) * M. */
#define BO r16
#define B8 r17
#define B4 r18
#define B2 r19
#define B1 r20
/* o4/o16/o32/o48 are loaded with the constants 4/16/32/48 in the setup code;
   presumably they serve as index registers for indexed loads/stores. */
#define o4 r21
/* T2 is used in this file only as scratch for the -8 mask. */
#define T2 r22
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
/* NOTUS2, T1: not referenced in this file. */
#define NOTUS2 r27
/* M8 is set to M << (3 + ZBASE_SHIFT) in the setup code. */
#define M8 r30
#define T1 r31
/* o0 is the literal 0, not a register. */
#define o0 0
/* Copy macros used by the logic file included further below (contents not
   visible here). */
#include "cgemm_tcopy_macros_8_power8.S"
/* Number of bytes of stack reserved by the prologue; the register save area
   below uses offsets 144..280 within it. */
#define STACKSIZE 384
/* Entry point of the routine: reserves a stack frame, saves r14-r31, derives
   the strides and destination pointers needed by the copy logic, runs the
   included logic file, then restores the registers and returns 0 in r3.
   PROLOGUE / PROFCODE / EPILOGUE are macros defined outside this file
   (presumably in common.h). */
PROLOGUE
PROFCODE
/* Reserve STACKSIZE bytes of stack. */
addi SP, SP, -STACKSIZE
/* r0 = 0; r0 is not used again in this file (it may be used by the included
   logic). */
li r0, 0
/* Save r14..r31 at offsets 280..144 of the new frame. r28 and r29 have no
   alias defined in this file but are saved and restored as well. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* Nothing to do when M <= 0 or N <= 0 (signed 32-bit compares): jump straight
   to the exit path. */
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999
/* Scale LDA by 2^ZBASE_SHIFT. NOTE(review): presumably converts a leading
   dimension in complex elements to bytes - ZBASE_SHIFT is defined elsewhere. */
slwi LDA, LDA, ZBASE_SHIFT
/* M8 = M * 8, scaled by the same 2^ZBASE_SHIFT factor. */
slwi M8, M, 3 + ZBASE_SHIFT
/* Build the masks -8, -4, -2 (PREA/PREB serve as scratch here and are
   reloaded below). */
li T2, -8
li PREA, -4
li PREB, -2
/* Round N down to a multiple of 8, 4 and 2 respectively. */
and B4, N, T2
and B2, N, PREA
and B1, N, PREB
/* Multiply each rounded count by M (mullw: low 32 bits of the product). */
mullw B4, B4, M
mullw B2, B2, M
mullw B1, B1, M
/* Scale the products by 2^ZBASE_SHIFT ... */
slwi B4, B4, ZBASE_SHIFT
slwi B2, B2, ZBASE_SHIFT
slwi B1, B1, ZBASE_SHIFT
/* ... and add the base B, giving three addresses inside the B buffer.
   NOTE(review): presumably where the 4-, 2- and 1-wide remainder panels are
   written - confirm in the included logic file. */
add B4, B4, B
add B2, B2, B
add B1, B1, B
/* PREA = 384, PREB = M8 + 128. NOTE(review): presumably prefetch distances
   for the source and destination streams - confirm in the logic file. */
li PREA, 384
addi PREB, M8, 128
/* Constant offsets 4, 16, 32, 48 kept in registers. */
li o4, 4
li o16, 16
li o32, 32
li o48, 48
/* The copy loops themselves live in the included file (not visible here). */
#include "cgemm_tcopy_logic_8_power8.S"
/* Common exit path: return 0 in r3, restore r14..r31, release the frame. */
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

View File

@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * Row groups of 4: I = M >> 2 iterations.  Each iteration copies four
 * source rows (A0..A3, LDA bytes apart) in tiles of 8, 4, 2 and 1
 * elements, then moves A down by four rows.
 */
srawi. I, M, 2
ble CCOPYT_L2_BEGIN
CCOPYT_L4_BEGIN:
/* Set up the four row pointers and advance A past them. */
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
/* B8 = where this row group's 8-wide tiles start; B moves on by one 4x8 tile. */
mr B8, B
addi B, B, 64*SIZE
/* J = number of 8-element tiles in a row. */
sradi. J, N, 3
ble CCOPYT_L4x4_BEGIN
mr BO, B8
/* 8-wide loop, unrolled twice; the mid-loop ble exits on an odd count. */
CCOPYT_L4x8_LOOP:
/* Prefetch upcoming source data and the next destination tiles. */
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
dcbtst BO, M8
dcbtst BO, PREB
COPY_4x8
/* Successive 8-wide tiles of one row group are M8 bytes apart in B. */
add BO, BO, M8
addic. J, J, -1
ble CCOPYT_L4x4_BEGIN
COPY_4x8
add BO, BO, M8
addic. J, J, -1
bgt CCOPYT_L4x8_LOOP
/* Remainder tiles: taken when bit 4, 2 or 1 of N is set.  andi. never   */
/* yields a negative value, so ble here branches exactly on zero.       */
CCOPYT_L4x4_BEGIN:
andi. T1, N, 4
ble CCOPYT_L4x2_BEGIN
mr BO, B4
COPY_4x4
addi B4, B4, 32*SIZE
CCOPYT_L4x2_BEGIN:
andi. T1, N, 2
ble CCOPYT_L4x1_BEGIN
mr BO, B2
COPY_4x2
addi B2, B2, 16*SIZE
CCOPYT_L4x1_BEGIN:
andi. T1, N, 1
ble CCOPYT_L4_END
mr BO, B1
COPY_4x1
addi B1, B1, 8*SIZE
CCOPYT_L4_END:
/* Next group of four rows. */
addic. I, I, -1
bgt CCOPYT_L4_BEGIN
/*
 * Row group of 2: executed once when bit 2 of M is set.  Same tiling as
 * the 4-row case, using only A0 and A1.
 */
CCOPYT_L2_BEGIN:
andi. T1, M, 2
ble CCOPYT_L1_BEGIN
/* Two row pointers; A moves down by two rows. */
mr A0, A
add A1, A0, LDA
add A, A1, LDA
/* B8 = start of this group's 8-wide tiles; B moves on by one 2x8 tile. */
mr B8, B
addi B, B, 32*SIZE
sradi. J, N, 3
ble CCOPYT_L2x4_BEGIN
mr BO, B8
/* One 2x8 tile per iteration, M8 bytes apart in B. */
CCOPYT_L2x8_LOOP:
COPY_2x8
add BO, BO, M8
addic. J, J, -1
bgt CCOPYT_L2x8_LOOP
/* Remainder tiles selected by bits 4, 2 and 1 of N. */
CCOPYT_L2x4_BEGIN:
andi. T1, N, 4
ble CCOPYT_L2x2_BEGIN
mr BO, B4
COPY_2x4
addi B4, B4, 16*SIZE
CCOPYT_L2x2_BEGIN:
andi. T1, N, 2
ble CCOPYT_L2x1_BEGIN
mr BO, B2
COPY_2x2
addi B2, B2, 8*SIZE
CCOPYT_L2x1_BEGIN:
andi. T1, N, 1
ble CCOPYT_L2_END
mr BO, B1
COPY_2x1
addi B1, B1, 4*SIZE
CCOPYT_L2_END:
/*
 * Final single row: executed once when bit 1 of M is set, otherwise the
 * routine leaves through L999 (defined in the including file).
 */
CCOPYT_L1_BEGIN:
andi. T1, M, 1
ble L999
/* One row pointer; A moves down by one row. */
mr A0, A
add A, A0, LDA
/* B8 = start of this row's 8-wide tiles; B moves on by one 1x8 tile. */
mr B8, B
addi B, B, 16*SIZE
sradi. J, N, 3
ble CCOPYT_L1x4_BEGIN
mr BO, B8
/* One 1x8 tile per iteration, M8 bytes apart in B. */
CCOPYT_L1x8_LOOP:
COPY_1x8
add BO, BO, M8
addic. J, J, -1
bgt CCOPYT_L1x8_LOOP
/* Remainder tiles selected by bits 4, 2 and 1 of N. */
CCOPYT_L1x4_BEGIN:
andi. T1, N, 4
ble CCOPYT_L1x2_BEGIN
mr BO, B4
COPY_1x4
addi B4, B4, 8*SIZE
CCOPYT_L1x2_BEGIN:
andi. T1, N, 2
ble CCOPYT_L1x1_BEGIN
mr BO, B2
COPY_1x2
addi B2, B2, 4*SIZE
CCOPYT_L1x1_BEGIN:
andi. T1, N, 1
ble CCOPYT_L1_END
mr BO, B1
COPY_1x1
addi B1, B1, 2*SIZE
/* Execution continues with the code following the #include (label L999). */
CCOPYT_L1_END:

View File

@ -0,0 +1,385 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
* COPY_4x8: 4 source rows (A0..A3) x 8 complex single-precision elements.
* The driver uses it while stepping M by 4 and N by 8 (the original banner
* named these the other way round: "N=4 and M=8").
*
* Reads 64 bytes from each of A0..A3 and advances each pointer by 64.
* Writes the rows back to back (A0 row first) as 256 contiguous bytes at BO.
* BO itself is not modified; T1 is used as the store pointer and is clobbered.
* Uses vs32..vs47.
**********************************************************************************************/
.macro COPY_4x8
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o32, A0
lxvw4x vs35, o48, A0
lxvw4x vs36, o0, A1
lxvw4x vs37, o16, A1
lxvw4x vs38, o32, A1
lxvw4x vs39, o48, A1
addi A0, A0, 64
addi A1, A1, 64
lxvw4x vs40, o0, A2
lxvw4x vs41, o16, A2
lxvw4x vs42, o32, A2
lxvw4x vs43, o48, A2
lxvw4x vs44, o0, A3
lxvw4x vs45, o16, A3
lxvw4x vs46, o32, A3
lxvw4x vs47, o48, A3
mr T1, BO
addi A2, A2, 64
addi A3, A3, 64
/* Row A0 -> bytes 0..63 */
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
addi T1, T1, 64
/* Row A1 -> bytes 64..127 */
stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
addi T1, T1, 64
/* Row A2 -> bytes 128..191 */
stxvw4x vs40, o0, T1
stxvw4x vs41, o16, T1
stxvw4x vs42, o32, T1
stxvw4x vs43, o48, T1
addi T1, T1, 64
/* Row A3 -> bytes 192..255 */
stxvw4x vs44, o0, T1
stxvw4x vs45, o16, T1
stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1
.endm
/**********************************************************************************************
* COPY_4x4: 4 source rows (A0..A3) x 4 complex single-precision elements
* (driver: M stepped by 4, N bit 4; original banner: "N=4 and M=4").
*
* Reads 32 bytes from each of A0..A3 and advances each pointer by 32.
* Writes the rows back to back as 128 contiguous bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32..vs39.
**********************************************************************************************/
.macro COPY_4x4
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
addi A0, A0, 32
lxvw4x vs34, o0, A1
lxvw4x vs35, o16, A1
addi A1, A1, 32
lxvw4x vs36, o0, A2
lxvw4x vs37, o16, A2
addi A2, A2, 32
lxvw4x vs38, o0, A3
lxvw4x vs39, o16, A3
addi A3, A3, 32
mr T1, BO
/* Rows A0 and A1 -> bytes 0..63 */
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
addi T1, T1, 64
/* Rows A2 and A3 -> bytes 64..127 */
stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
.endm
/**********************************************************************************************
* COPY_4x2: 4 source rows (A0..A3) x 2 complex single-precision elements
* (driver: M stepped by 4, N bit 2; original banner: "N=4 and M=2").
*
* Reads 16 bytes from each of A0..A3 and advances each pointer by 16.
* Writes the rows back to back as 64 contiguous bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32..vs35.
**********************************************************************************************/
.macro COPY_4x2
lxvw4x vs32, o0, A0
addi A0, A0, 16
lxvw4x vs33, o0, A1
addi A1, A1, 16
lxvw4x vs34, o0, A2
addi A2, A2, 16
lxvw4x vs35, o0, A3
addi A3, A3, 16
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
.endm
/**********************************************************************************************
* COPY_4x1: 4 source rows (A0..A3) x 1 complex single-precision element
* (driver: M stepped by 4, N bit 1; original banner: "N=4 and M=1").
*
* Uses scalar single-precision loads/stores: two floats (offsets 0 and o4)
* per row.  Each of A0..A3 advances by 8; 32 contiguous bytes are written
* at BO.  BO is not modified; T1 is clobbered.  Uses vs32..vs39.
**********************************************************************************************/
.macro COPY_4x1
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
addi A0, A0, 8
lxsspx vs34, o0, A1
lxsspx vs35, o4, A1
addi A1, A1, 8
lxsspx vs36, o0, A2
lxsspx vs37, o4, A2
addi A2, A2, 8
lxsspx vs38, o0, A3
lxsspx vs39, o4, A3
addi A3, A3, 8
mr T1, BO
/* One 8-byte element per row, rows in order A0, A1, A2, A3. */
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
addi T1, T1, 8
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
addi T1, T1, 8
stxsspx vs36, o0, T1
stxsspx vs37, o4, T1
addi T1, T1, 8
stxsspx vs38, o0, T1
stxsspx vs39, o4, T1
.endm
/**********************************************************************************************
* COPY_2x8: 2 source rows (A0, A1) x 8 complex single-precision elements
* (driver: M bit 2, N stepped by 8; original banner: "N=2 and M=8").
*
* Reads 64 bytes from each of A0 and A1 and advances each pointer by 64.
* Writes the rows back to back as 128 contiguous bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32..vs39.
**********************************************************************************************/
.macro COPY_2x8
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o32, A0
lxvw4x vs35, o48, A0
addi A0, A0, 64
lxvw4x vs36, o0, A1
lxvw4x vs37, o16, A1
lxvw4x vs38, o32, A1
lxvw4x vs39, o48, A1
addi A1, A1, 64
mr T1, BO
/* Row A0 -> bytes 0..63 */
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
addi T1, T1, 64
/* Row A1 -> bytes 64..127 */
stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
.endm
/**********************************************************************************************
* COPY_2x4: 2 source rows (A0, A1) x 4 complex single-precision elements
* (driver: M bit 2, N bit 4; original banner: "N=2 and M=4").
*
* Reads 32 bytes from each of A0 and A1 and advances each pointer by 32.
* Writes the rows back to back as 64 contiguous bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32..vs35.
**********************************************************************************************/
.macro COPY_2x4
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
addi A0, A0, 32
lxvw4x vs34, o0, A1
lxvw4x vs35, o16, A1
addi A1, A1, 32
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
.endm
/**********************************************************************************************
* COPY_2x2: 2 source rows (A0, A1) x 2 complex single-precision elements
* (driver: M bit 2, N bit 2; original banner: "N=2 and M=2").
*
* Reads 16 bytes from each of A0 and A1 and advances each pointer by 16.
* Writes the rows back to back as 32 contiguous bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32, vs33.
**********************************************************************************************/
.macro COPY_2x2
lxvw4x vs32, o0, A0
addi A0, A0, 16
lxvw4x vs33, o0, A1
addi A1, A1, 16
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
.endm
/**********************************************************************************************
* COPY_2x1: 2 source rows (A0, A1) x 1 complex single-precision element
* (driver: M bit 2, N bit 1; original banner: "N=2 and M=1").
*
* Uses scalar single-precision loads/stores: two floats (offsets 0 and o4)
* per row.  A0 and A1 advance by 8; 16 contiguous bytes are written at BO.
* BO is not modified; T1 is clobbered.  Uses vs32..vs35.
**********************************************************************************************/
.macro COPY_2x1
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
addi A0, A0, 8
lxsspx vs34, o0, A1
lxsspx vs35, o4, A1
addi A1, A1, 8
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
addi T1, T1, 8
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
.endm
/**********************************************************************************************
* COPY_1x8: 1 source row (A0) x 8 complex single-precision elements
* (driver: M bit 1, N stepped by 8; original banner: "N=1 and M=8").
*
* Reads 64 bytes from A0, advances A0 by 64 and writes the 64 bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32..vs35.
**********************************************************************************************/
.macro COPY_1x8
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o32, A0
lxvw4x vs35, o48, A0
addi A0, A0, 64
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
.endm
/**********************************************************************************************
* COPY_1x4: 1 source row (A0) x 4 complex single-precision elements
* (driver: M bit 1, N bit 4; original banner: "N=1 and M=4").
*
* Reads 32 bytes from A0, advances A0 by 32 and writes the 32 bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32, vs33.
**********************************************************************************************/
.macro COPY_1x4
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
addi A0, A0, 32
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
.endm
/**********************************************************************************************
* COPY_1x2: 1 source row (A0) x 2 complex single-precision elements
* (driver: M bit 1, N bit 2; original banner: "N=1 and M=2").
*
* Reads 16 bytes from A0, advances A0 by 16 and writes the 16 bytes at BO.
* BO is not modified; T1 is clobbered.  Uses vs32.
**********************************************************************************************/
.macro COPY_1x2
lxvw4x vs32, o0, A0
addi A0, A0, 16
mr T1, BO
stxvw4x vs32, o0, T1
.endm
/**********************************************************************************************
* COPY_1x1: 1 source row (A0) x 1 complex single-precision element
* (driver: M bit 1, N bit 1).
*
* Loads two floats (offsets 0 and o4) from A0 with scalar single-precision
* loads, advances A0 by 8 and stores the two floats at BO.
* BO is not modified; T1 is clobbered.  Uses vs32, vs33.
**********************************************************************************************/
.macro COPY_1x1
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
addi A0, A0, 8
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
.endm

View File

@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0 #define o0 0
#define T4 r12
#define T3 r11
#define o40 r12
#define o56 r11
#define o112 r14
#define o8 r15 #define o8 r15
#define o24 r16 #define o24 r16
#define ALPHA r17 #define o64 r17
#define L r18 #define L r18
#define T1 r19 #define T1 r19
#define KK r20 #define o80 r20
#define BB r21 #define o96 r21
#define I r22 #define I r22
#define J r23 #define J r23
#define AO r24 #define AO r24
@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP) std r17, 256(SP)
std r16, 264(SP) std r16, 264(SP)
std r15, 272(SP) std r15, 272(SP)
std r14, 280(SP)
#else #else
stw r31, 144(SP) stw r31, 144(SP)
stw r30, 148(SP) stw r30, 148(SP)
@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stw r17, 200(SP) stw r17, 200(SP)
stw r16, 204(SP) stw r16, 204(SP)
stw r15, 208(SP) stw r15, 208(SP)
stw r14, 212(SP)
#endif #endif
stfd f1, ALPHA_SP stfd f1, ALPHA_SP
@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble .L999_H1 ble .L999_H1
#ifdef __64BIT__ #ifdef __64BIT__
addi ALPHA, SP, 296 addi T1, SP, 296
#else #else
addi ALPHA, SP, 224 addi T1, SP, 224
#endif #endif
li PRE, 256 li PRE, 384
li o8 , 8 li o8 , 8
li o16, 16 li o16, 16
li o24, 24 li o24, 24
li o32, 32 li o32, 32
li o48, 48 li o48, 48
li o64, 64
li o80, 80
li o96, 96
li o112, 112
lxvdsx alpha_r, 0, ALPHA lxvdsx alpha_r, 0, T1
#include "dgemm_logic_16x4_power8.S" #include "dgemm_logic_16x4_power8.S"
@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP) ld r17, 256(SP)
ld r16, 264(SP) ld r16, 264(SP)
ld r15, 272(SP) ld r15, 272(SP)
ld r14, 280(SP)
#else #else
lwz r31, 144(SP) lwz r31, 144(SP)
lwz r30, 148(SP) lwz r30, 148(SP)
@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lwz r17, 200(SP) lwz r17, 200(SP)
lwz r16, 204(SP) lwz r16, 204(SP)
lwz r15, 208(SP) lwz r15, 208(SP)
lwz r14, 212(SP)
#endif #endif
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE

File diff suppressed because it is too large Load Diff

View File

@ -47,21 +47,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs24, 0, BO lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO lxvdsx vs25, o8, BO
addi AO, AO, 64 lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
lxvd2x vs4, 0, AO lxvd2x vs6, o96, AO
lxvd2x vs5, o16, AO lxvd2x vs7, o112, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvdsx vs26, o16, BO lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO lxvdsx vs27, o24, BO
addi AO, AO, 64 addi AO, AO, 128
addi BO, BO, 32 addi BO, BO, 32
.endm .endm
.macro KERNEL4x16_I1 .macro KERNEL4x16_I1
xvmuldp vs32, vs0, vs24 xvmuldp vs32, vs0, vs24
@ -69,8 +68,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs34, vs2, vs24 xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24 xvmuldp vs35, vs3, vs24
lxvd2x vs8, 0, AO lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmuldp vs36, vs4, vs24 xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24 xvmuldp vs37, vs5, vs24
@ -85,37 +86,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs42, vs2, vs25 xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25 xvmuldp vs43, vs3, vs25
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmuldp vs44, vs4, vs25 xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25 xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25 xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25 xvmuldp vs47, vs7, vs25
addi AO, AO, 64
xvmuldp vs48, vs0, vs26 xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26 xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26 xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26 xvmuldp vs51, vs3, vs26
lxvd2x vs12, 0, AO lxvd2x vs12, o64, AO
lxvd2x vs13, o16, AO lxvd2x vs13, o80, AO
xvmuldp vs52, vs4, vs26 xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26 xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26 xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26 xvmuldp vs55, vs7, vs26
lxvd2x vs14, o32, AO lxvd2x vs14, o96, AO
lxvd2x vs15, o48, AO lxvd2x vs15, o112, AO
xvmuldp vs56, vs0, vs27 xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27 xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27 xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27 xvmuldp vs59, vs3, vs27
lxvdsx vs30, o16, BO lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO lxvdsx vs31, o24, BO
@ -124,11 +123,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs62, vs6, vs27 xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27 xvmuldp vs63, vs7, vs27
addi AO, AO, 64 addi AO, AO, 128
addi BO, BO, 32
.endm .endm
.macro KERNEL4x16_1 .macro KERNEL4x16_1
xvmaddadp vs32, vs0, vs24 xvmaddadp vs32, vs0, vs24
@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs34, vs2, vs24 xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24 xvmaddadp vs35, vs3, vs24
lxvd2x vs8, 0, AO lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs36, vs4, vs24 xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24 xvmaddadp vs37, vs5, vs24
@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs42, vs2, vs25 xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25 xvmaddadp vs43, vs3, vs25
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs44, vs4, vs25 xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25 xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25 xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25 xvmaddadp vs47, vs7, vs25
addi AO, AO, 64
xvmaddadp vs48, vs0, vs26 xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26 xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26 xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26 xvmaddadp vs51, vs3, vs26
lxvd2x vs12, 0, AO lxvd2x vs12, o64, AO
lxvd2x vs13, o16, AO lxvd2x vs13, o80, AO
xvmaddadp vs52, vs4, vs26 xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26 xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26 xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26 xvmaddadp vs55, vs7, vs26
lxvd2x vs14, o32, AO lxvd2x vs14, o96, AO
lxvd2x vs15, o48, AO lxvd2x vs15, o112, AO
xvmaddadp vs56, vs0, vs27 xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27 xvmaddadp vs57, vs1, vs27
@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs6, vs27 xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27 xvmaddadp vs63, vs7, vs27
addi AO, AO, 64 addi AO, AO, 128
addi BO, BO, 32 addi BO, BO, 32
.endm .endm
@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs46, vs14, vs29 xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29 xvmaddadp vs47, vs15, vs29
addi AO, AO, 64
xvmaddadp vs48, vs8, vs30 xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30 xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30 xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30 xvmaddadp vs51, vs11, vs30
lxvd2x vs4, 0, AO lxvd2x vs4, o64, AO
lxvd2x vs5, o16, AO lxvd2x vs5, o80, AO
xvmaddadp vs52, vs12, vs30 xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30 xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30 xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30 xvmaddadp vs55, vs15, vs30
lxvd2x vs6, o32, AO lxvd2x vs6, o96, AO
lxvd2x vs7, o48, AO lxvd2x vs7, o112, AO
xvmaddadp vs56, vs8, vs31 xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31 xvmaddadp vs57, vs9, vs31
@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs14, vs31 xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31 xvmaddadp vs63, vs15, vs31
addi AO, AO, 64 addi AO, AO, 128
addi BO, BO, 32 addi BO, BO, 32
.endm .endm
/*
 * KERNEL4x16_L1: first half of a two-step update (see KERNEL4x16_L2).
 *
 * Accumulates vs0..vs7 times vs24, vs25, vs26, vs27 into vs32..vs39,
 * vs40..vs47, vs48..vs55 and vs56..vs63 respectively.  Interleaved with
 * the multiply-adds it loads the operands of the next step: vs8..vs15
 * from AO (offsets 0..112) and vs28..vs31 from BO (offsets 0, 8, 16, 24;
 * lxvdsx loads one doubleword and replicates it).
 *
 * AO is advanced by 128 bytes.  BO is NOT advanced here; KERNEL4x16_L2
 * reads BO at offsets 32..56 and then advances it by 64.
 */
.macro KERNEL4x16_L1
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmaddadp vs60, vs4, vs27
xvmaddadp vs61, vs5, vs27
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
addi AO, AO, 128
.endm
/*
 * KERNEL4x16_L2: second half of the two-step update started by
 * KERNEL4x16_L1.
 *
 * Accumulates vs8..vs15 times vs28, vs29, vs30, vs31 into vs32..vs39,
 * vs40..vs47, vs48..vs55 and vs56..vs63 respectively.  Interleaved with
 * the multiply-adds it reloads vs0..vs7 from AO (offsets 0..112) and
 * vs24..vs27 from BO at offsets 32, 40, 48 and 56.
 *
 * AO is advanced by 128 bytes and BO by 64 bytes; both addi instructions
 * are placed between the last multiply-adds, after the final loads.
 *
 * NOTE(review): o40 and o56 are register operands and are not set inside
 * this macro -- confirm the caller loads them with 40 and 56 before each
 * use, in particular if those registers are shared with scratch names.
 */
.macro KERNEL4x16_L2
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
lxvdsx vs24, o32, BO
lxvdsx vs25, o40, BO
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31
lxvdsx vs26, o48, BO
lxvdsx vs27, o56, BO
xvmaddadp vs60, vs12, vs31
addi AO, AO, 128
xvmaddadp vs61, vs13, vs31
xvmaddadp vs62, vs14, vs31
addi BO, BO, 64
xvmaddadp vs63, vs15, vs31
.endm
.macro KERNEL4x16_E2 .macro KERNEL4x16_E2
@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs26, o16, BO lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
lxvd2x vs4, 0, AO lxvd2x vs4, o64, AO
lxvd2x vs5, o16, AO lxvd2x vs5, o80, AO
lxvd2x vs6, o32, AO lxvd2x vs6, o96, AO
lxvd2x vs7, o48, AO lxvd2x vs7, o112, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24 xvmaddadp vs32, vs0, vs24
@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs41, vs1, vs25 xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25 xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25 xvmaddadp vs43, vs3, vs25
addi BO, BO, 32
xvmaddadp vs44, vs4, vs25 xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25 xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25 xvmaddadp vs46, vs6, vs25
@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs1, vs26 xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26 xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26 xvmaddadp vs51, vs3, vs26
addi AO, AO, 128
xvmaddadp vs52, vs4, vs26 xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26 xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26 xvmaddadp vs54, vs6, vs26
@ -430,21 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16 .macro SAVE4x16
mr T1, CO mr T1, CO
addi T2, T1, 64 add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
#ifndef TRMMKERNEL lxvd2x vs0, 0, CO
lxvd2x vs0, 0, T1 lxvd2x vs1, o16, CO
lxvd2x vs1, o16, T1 lxvd2x vs2, o32, CO
lxvd2x vs2, o32, T1 lxvd2x vs3, o48, CO
lxvd2x vs3, o48, T1 lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO
lxvd2x vs4, 0, T2 lxvd2x vs8, 0, T2
lxvd2x vs5, o16, T2 lxvd2x vs9, o16, T2
lxvd2x vs6, o32, T2 lxvd2x vs10, o32, T2
lxvd2x vs7, o48, T2 lxvd2x vs11, o48, T2
#endif lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2
lxvd2x vs24, 0, T3
lxvd2x vs25, o16, T3
lxvd2x vs26, o32, T3
lxvd2x vs27, o48, T3
lxvd2x vs28, o64, T3
lxvd2x vs29, o80, T3
lxvd2x vs30, o96, T3
lxvd2x vs31, o112, T3
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r xvmaddadp vs2, vs34, alpha_r
@ -453,172 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1 lxvd2x vs32, 0, T4
stxvd2x vs1, o16, T1 lxvd2x vs33, o16, T4
stxvd2x vs2, o32, T1 lxvd2x vs34, o32, T4
stxvd2x vs3, o48, T1 lxvd2x vs35, o48, T4
lxvd2x vs36, o64, T4
lxvd2x vs37, o80, T4
lxvd2x vs38, o96, T4
lxvd2x vs39, o112, T4
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
dcbt T1, PRE
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
xvmaddadp vs1, vs49, alpha_r
xvmaddadp vs2, vs50, alpha_r
xvmaddadp vs3, vs51, alpha_r
xvmaddadp vs4, vs52, alpha_r
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
#endif
stxvd2x vs0, 0, T1 stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1 stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1 stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1 stxvd2x vs3, o48, T1
dcbt T1, PRE xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
stxvd2x vs4, 0, T2 stxvd2x vs4, o64, T1
stxvd2x vs5, o16, T2 stxvd2x vs5, o80, T1
stxvd2x vs6, o32, T2 stxvd2x vs6, o96, T1
stxvd2x vs7, o48, T2 stxvd2x vs7, o112, T1
add T1, T1, LDC xvmaddadp vs24, vs48, alpha_r
add T2, T2, LDC xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r
#ifndef TRMMKERNEL stxvd2x vs8, o0, T2
lxvd2x vs8, 0, T1 stxvd2x vs9, o16, T2
lxvd2x vs9, o16, T1 stxvd2x vs10, o32, T2
lxvd2x vs10, o32, T1 stxvd2x vs11, o48, T2
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2 xvmaddadp vs28, vs52, alpha_r
lxvd2x vs13, o16, T2 xvmaddadp vs29, vs53, alpha_r
lxvd2x vs14, o32, T2 xvmaddadp vs30, vs54, alpha_r
lxvd2x vs15, o48, T2 xvmaddadp vs31, vs55, alpha_r
#endif
#ifndef TRMMKERNEL stxvd2x vs12, o64, T2
xvmaddadp vs8, vs56, alpha_r stxvd2x vs13, o80, T2
xvmaddadp vs9, vs57, alpha_r stxvd2x vs14, o96, T2
xvmaddadp vs10, vs58, alpha_r stxvd2x vs15, o112, T2
xvmaddadp vs11, vs59, alpha_r
xvmaddadp vs12, vs60, alpha_r
xvmaddadp vs13, vs61, alpha_r
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
xvmuldp vs12, vs60, alpha_r
xvmuldp vs13, vs61, alpha_r
xvmuldp vs14, vs62, alpha_r
xvmuldp vs15, vs63, alpha_r
#endif
stxvd2x vs8, 0, T1 xvmaddadp vs32, vs56, alpha_r
stxvd2x vs9, o16, T1 xvmaddadp vs33, vs57, alpha_r
stxvd2x vs10, o32, T1 xvmaddadp vs34, vs58, alpha_r
stxvd2x vs11, o48, T1 xvmaddadp vs35, vs59, alpha_r
dcbt T1, PRE stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
stxvd2x vs12, 0, T2 xvmaddadp vs36, vs60, alpha_r
stxvd2x vs13, o16, T2 xvmaddadp vs37, vs61, alpha_r
stxvd2x vs14, o32, T2 xvmaddadp vs38, vs62, alpha_r
stxvd2x vs15, o48, T2 xvmaddadp vs39, vs63, alpha_r
stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3
stxvd2x vs32, o0, T4
stxvd2x vs33, o16, T4
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4
addi CO, CO, 128 addi CO, CO, 128
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4
stxvd2x vs39, o112, T4
.endm .endm
/********************************************************************* /*********************************************************************

View File

@ -0,0 +1,228 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*
 * Register assignments for the 4-column DGEMM "ncopy" packing routine.
 * The names below are used by the included macro and logic files.
 */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Incoming arguments (first five integer argument registers). */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7
/* Per-column source pointers used by the COPY_* macros. */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11
/* J: inner (M direction) loop counter / scratch for the remainder tests. */
#define J r12
/* PREA: prefetch distance used with dcbt. PREB is initialised by the    */
/* function but not referenced by the visible logic or macros.           */
#define PREA r14
#define PREB r15
/* BO: running destination pointer into the packed buffer. */
#define BO r16
/* o8..o112: registers holding constant byte offsets for indexed VSX     */
/* loads/stores (o8, T2, NOTU1 and NOTU2 are not referenced by the       */
/* visible logic or macros).                                             */
#define o64 r17
#define o80 r18
#define o96 r19
#define o112 r20
#define o8 r21
#define T2 r22
/* I: outer (N direction) loop counter. */
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
#define NOTU1 r27
#define NOTU2 r30
/* T1: scratch result register for the N remainder tests. */
#define T1 r31
/* o0 is the literal 0: as the RA operand of an indexed load/store it    */
/* means "no index", so the address is just the base register.           */
#define o0 0
#include "dgemm_ncopy_macros_4_power8.S"
/*
 * Stack frame layout (offsets from the lowered SP):
 *     0 .. 143   f14-f31  (callee-saved FPRs, aliased by vs14-vs31)
 *   144 .. 287   r31-r14  (callee-saved GPRs)
 *   288 .. 479   vs52-vs63 (= v20-v31, callee-saved vector registers)
 * The frame was 384 bytes before the vector save area was added.
 */
#define STACKSIZE 512
/*
 * Entry point: pack an M x N column-major block of doubles from A (leading
 * dimension LDA, in elements) into B using the 4-column ncopy layout
 * implemented by dgemm_ncopy_logic_4_power8.S. Always returns 0 in r3.
 */
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
/* r0 is not referenced by the visible logic; kept as in the original. */
li r0, 0
/* Save f14-f31: the COPY_* macros load into vs0-vs31, which overlay them. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Save the callee-saved GPRs used for pointers, counters and offsets. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/*
 * Save vs52-vs63. They alias v20-v31, which are callee-saved, and
 * COPY_4x16 overwrites all of them. r11 is a volatile register that is
 * not live yet (it only becomes A3 inside the copy logic), so it is used
 * as the index register: EA = r11 + SP.
 */
li r11, 288
stxvd2x vs52, r11, SP
li r11, 304
stxvd2x vs53, r11, SP
li r11, 320
stxvd2x vs54, r11, SP
li r11, 336
stxvd2x vs55, r11, SP
li r11, 352
stxvd2x vs56, r11, SP
li r11, 368
stxvd2x vs57, r11, SP
li r11, 384
stxvd2x vs58, r11, SP
li r11, 400
stxvd2x vs59, r11, SP
li r11, 416
stxvd2x vs60, r11, SP
li r11, 432
stxvd2x vs61, r11, SP
li r11, 448
stxvd2x vs62, r11, SP
li r11, 464
stxvd2x vs63, r11, SP
/* Nothing to copy when either dimension is <= 0. */
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999
/* Scale LDA by BASE_SHIFT (from common.h; presumably elements -> bytes). */
slwi LDA, LDA, BASE_SHIFT
/* Prefetch distances in bytes. */
li PREA, 384
li PREB, 384
/* Constant byte offsets used as index registers by the COPY_* macros. */
li o8, 8
li o16, 16
li o32, 32
li o48, 48
li o64, 64
li o80, 80
li o96, 96
li o112, 112
#include "dgemm_ncopy_logic_4_power8.S"
L999:
/* Common exit: return 0 and restore everything saved in the prologue. */
li r3, 0
/* Restore vs52-vs63 first; r11 is volatile and free to use as index. */
li r11, 288
lxvd2x vs52, r11, SP
li r11, 304
lxvd2x vs53, r11, SP
li r11, 320
lxvd2x vs54, r11, SP
li r11, 336
lxvd2x vs55, r11, SP
li r11, 352
lxvd2x vs56, r11, SP
li r11, 368
lxvd2x vs57, r11, SP
li r11, 384
lxvd2x vs58, r11, SP
li r11, 400
lxvd2x vs59, r11, SP
li r11, 416
lxvd2x vs60, r11, SP
li r11, 432
lxvd2x vs61, r11, SP
li r11, 448
lxvd2x vs62, r11, SP
li r11, 464
lxvd2x vs63, r11, SP
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

View File

@ -0,0 +1,237 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * 4-column panels. BO walks the destination buffer; I counts groups of
 * four source columns (N >> 2). For each group the M direction is handled
 * in chunks of 16, then 8, 4, 2 and 1 elements according to the bits of M.
 */
mr BO, B
srawi. I, N, 2
ble DCOPYN_L2_BEGIN
DCOPYN_L4_BEGIN:
DCOPYN_L4_LOOP:
/* A0..A3 point at four consecutive columns, LDA apart; A moves past them. */
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
DCOPYN_L4x16_BEGIN:
/* J = M >> 4: number of full 16-element chunks. */
srawi. J, M, 4
ble DCOPYN_L4x16_END
DCOPYN_L4x16_LOOP:
/* Prefetch each column PREA bytes ahead of the current position. */
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
COPY_4x16
addic. J, J, -1
bgt DCOPYN_L4x16_LOOP
DCOPYN_L4x16_END:
DCOPYN_L4x8_BEGIN:
/* Remainders: andi. sets CR0 from (M & bit); the branch skips on zero. */
andi. J, M, 8
ble DCOPYN_L4x8_END
COPY_4x8
DCOPYN_L4x8_END:
DCOPYN_L4x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L4x4_END
COPY_4x4
DCOPYN_L4x4_END:
DCOPYN_L4x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L4x2_END
COPY_4x2
DCOPYN_L4x2_END:
DCOPYN_L4x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L4x1_END
COPY_4x1
DCOPYN_L4x1_END:
DCOPYN_L4_END:
/* Next group of four columns. */
addic. I, I, -1
bgt DCOPYN_L4_LOOP
DCOPYN_L2_BEGIN:
/*
 * Two-column remainder: executed once when bit 1 of N is set.
 * The test uses the symbolic register N rather than a bare register
 * number, so it stays correct if the register assignment ever changes.
 */
andi. T1, N, 2
ble DCOPYN_L2_END
DCOPYN_L2_LOOP:
/* A0/A1 point at the two columns; A moves past them. */
mr A0, A
add A1, A0, LDA
add A, A1, LDA
DCOPYN_L2x16_BEGIN:
/* J = M >> 4: number of full 16-element chunks. */
srawi. J, M, 4
ble DCOPYN_L2x16_END
DCOPYN_L2x16_LOOP:
COPY_2x16
addic. J, J, -1
bgt DCOPYN_L2x16_LOOP
DCOPYN_L2x16_END:
DCOPYN_L2x8_BEGIN:
/* Remainders of M: 8, 4, 2 and 1 elements, selected by the bits of M. */
andi. J, M, 8
ble DCOPYN_L2x8_END
COPY_2x8
DCOPYN_L2x8_END:
DCOPYN_L2x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L2x4_END
COPY_2x4
DCOPYN_L2x4_END:
DCOPYN_L2x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L2x2_END
COPY_2x2
DCOPYN_L2x2_END:
DCOPYN_L2x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L2x1_END
COPY_2x1
DCOPYN_L2x1_END:
DCOPYN_L2_END:
DCOPYN_L1_BEGIN:
/*
 * Single-column remainder: executed once when bit 0 of N is set.
 * The test uses the symbolic register N rather than a bare register
 * number, so it stays correct if the register assignment ever changes.
 */
andi. T1, N, 1
ble DCOPYN_L1_END
DCOPYN_L1_LOOP:
/* A0 points at the last column; A moves past it. */
mr A0, A
add A, A0, LDA
DCOPYN_L1x16_BEGIN:
/* J = M >> 4: number of full 16-element chunks. */
srawi. J, M, 4
ble DCOPYN_L1x16_END
DCOPYN_L1x16_LOOP:
COPY_1x16
addic. J, J, -1
bgt DCOPYN_L1x16_LOOP
DCOPYN_L1x16_END:
DCOPYN_L1x8_BEGIN:
/* Remainders of M: 8, 4, 2 and 1 elements, selected by the bits of M. */
andi. J, M, 8
ble DCOPYN_L1x8_END
COPY_1x8
DCOPYN_L1x8_END:
DCOPYN_L1x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L1x4_END
COPY_1x4
DCOPYN_L1x4_END:
DCOPYN_L1x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L1x2_END
COPY_1x2
DCOPYN_L1x2_END:
DCOPYN_L1x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L1x1_END
COPY_1x1
DCOPYN_L1x1_END:
DCOPYN_L1_END:

View File

@ -0,0 +1,691 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
* Macros for N=4 and M=16
**********************************************************************************************/
.macro COPY_4x16
/*
 * Interleave 16 doubles from each of the four columns A0..A3 into BO.
 * For each element index i the output holds A0[i], A1[i], A2[i], A3[i]
 * consecutively. Writes 512 bytes and advances BO by 512; A0..A3 are each
 * advanced by 128 bytes.
 * NOTE(review): the outputs use vs32-vs63; vs52-vs63 alias v20-v31, which
 * the ELF ABI treats as callee-saved, so the enclosing function is
 * responsible for preserving them.
 */
/* Loads: vs0-7 <- A0, vs8-15 <- A1, vs16-23 <- A2, vs24-31 <- A3 (2 doubles each). */
lxvd2x vs0, o0, A0
lxvd2x vs8, o0, A1
lxvd2x vs24, o0, A3
lxvd2x vs16, o0, A2
lxvd2x vs1, o16, A0
lxvd2x vs9, o16, A1
lxvd2x vs17, o16, A2
lxvd2x vs25, o16, A3
lxvd2x vs2, o32, A0
lxvd2x vs10, o32, A1
lxvd2x vs18, o32, A2
lxvd2x vs26, o32, A3
lxvd2x vs3, o48, A0
lxvd2x vs11, o48, A1
lxvd2x vs19, o48, A2
lxvd2x vs27, o48, A3
lxvd2x vs4, o64, A0
lxvd2x vs12, o64, A1
lxvd2x vs20, o64, A2
lxvd2x vs28, o64, A3
lxvd2x vs5, o80, A0
lxvd2x vs13, o80, A1
lxvd2x vs21, o80, A2
lxvd2x vs29, o80, A3
lxvd2x vs6, o96, A0
lxvd2x vs14, o96, A1
lxvd2x vs22, o96, A2
lxvd2x vs30, o96, A3
lxvd2x vs7, o112, A0
lxvd2x vs15, o112, A1
lxvd2x vs23, o112, A2
lxvd2x vs31, o112, A3
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
xxpermdi vs48, vs4, vs12, 0
xxpermdi vs49, vs20, vs28, 0
xxpermdi vs50, vs4, vs12, 3
xxpermdi vs51, vs20, vs28, 3
xxpermdi vs52, vs5, vs13, 0
xxpermdi vs53, vs21, vs29, 0
xxpermdi vs54, vs5, vs13, 3
xxpermdi vs55, vs21, vs29, 3
/* Source pointer bumps are interleaved with the remaining permutes. */
addi A0, A0, 128
addi A1, A1, 128
xxpermdi vs56, vs6, vs14, 0
xxpermdi vs57, vs22, vs30, 0
xxpermdi vs58, vs6, vs14, 3
xxpermdi vs59, vs22, vs30, 3
addi A3, A3, 128
addi A2, A2, 128
xxpermdi vs60, vs7, vs15, 0
xxpermdi vs61, vs23, vs31, 0
xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3
/* Store the 32 permuted vectors in four 128-byte groups. */
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
stxvd2x vs48, o0, BO
stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO
stxvd2x vs51, o48, BO
stxvd2x vs52, o64, BO
stxvd2x vs53, o80, BO
stxvd2x vs54, o96, BO
stxvd2x vs55, o112, BO
addi BO, BO, 128
stxvd2x vs56, o0, BO
stxvd2x vs57, o16, BO
stxvd2x vs58, o32, BO
stxvd2x vs59, o48, BO
stxvd2x vs60, o64, BO
stxvd2x vs61, o80, BO
stxvd2x vs62, o96, BO
stxvd2x vs63, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
.macro COPY_4x8
/*
 * Interleave 8 doubles from each of the four columns A0..A3 into BO
 * (A0[i], A1[i], A2[i], A3[i] per element index i). Writes 256 bytes and
 * advances BO by 256; A0..A3 are each advanced by 64 bytes.
 */
/* Loads: vs0-3 <- A0, vs8-11 <- A1, vs16-19 <- A2, vs24-27 <- A3. */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
addi A1, A1, 64
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs18, o32, A2
lxvd2x vs19, o48, A2
addi A2, A2, 64
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs26, o32, A3
lxvd2x vs27, o48, A3
addi A3, A3, 64
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
/* Store the 16 permuted vectors in two 128-byte groups. */
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
.macro COPY_4x4
/*
 * Interleave 4 doubles from each of the four columns A0..A3 into BO
 * (A0[i], A1[i], A2[i], A3[i] per element index i). Writes 128 bytes and
 * advances BO by 128; A0..A3 are each advanced by 32 bytes.
 */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
addi A1, A1, 32
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
addi A2, A2, 32
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
addi A3, A3, 32
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
.macro COPY_4x2
/*
 * Interleave 2 doubles from each of the four columns A0..A3 into BO
 * (A0[i], A1[i], A2[i], A3[i] per element index i). Writes 64 bytes and
 * advances BO by 64; A0..A3 are each advanced by 16 bytes.
 */
lxvd2x vs0, o0, A0
addi A0, A0, 16
lxvd2x vs8, o0, A1
addi A1, A1, 16
lxvd2x vs16, o0, A2
addi A2, A2, 16
lxvd2x vs24, o0, A3
addi A3, A3, 16
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
.macro COPY_4x1
/*
 * Copy one double from each of the four columns A0..A3 into BO as
 * A0[0], A1[0], A2[0], A3[0]. Writes 32 bytes and advances BO by 32;
 * A0..A3 are each advanced by 8 bytes.
 */
/* lxsdx loads a single double into doubleword 0 of the target register. */
lxsdx vs0, o0, A0
addi A0, A0, 8
lxsdx vs8, o0, A1
addi A1, A1, 8
lxsdx vs16, o0, A2
addi A2, A2, 8
lxsdx vs24, o0, A3
addi A3, A3, 8
/* Pair the doubleword-0 values so two scalars are stored per vector. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
addi BO, BO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/
.macro COPY_2x16
/*
 * Interleave 16 doubles from each of the two columns A0 and A1 into BO
 * (A0[i], A1[i] per element index i). Writes 256 bytes and advances BO by
 * 256; A0 and A1 are each advanced by 128 bytes.
 */
/* Loads: vs0-7 <- A0, vs8-15 <- A1 (2 doubles each). */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
lxvd2x vs4, o64, A0
lxvd2x vs5, o80, A0
lxvd2x vs6, o96, A0
lxvd2x vs7, o112, A0
addi A0, A0, 128
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
lxvd2x vs12, o64, A1
lxvd2x vs13, o80, A1
lxvd2x vs14, o96, A1
lxvd2x vs15, o112, A1
addi A1, A1, 128
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
xxpermdi vs36, vs2, vs10, 0
xxpermdi vs37, vs2, vs10, 3
xxpermdi vs38, vs3, vs11, 0
xxpermdi vs39, vs3, vs11, 3
xxpermdi vs40, vs4, vs12, 0
xxpermdi vs41, vs4, vs12, 3
xxpermdi vs42, vs5, vs13, 0
xxpermdi vs43, vs5, vs13, 3
xxpermdi vs44, vs6, vs14, 0
xxpermdi vs45, vs6, vs14, 3
xxpermdi vs46, vs7, vs15, 0
xxpermdi vs47, vs7, vs15, 3
/* Store the 16 permuted vectors in two 128-byte groups. */
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
.macro COPY_2x8
/*
 * Interleave 8 doubles from each of the two columns A0 and A1 into BO
 * (A0[i], A1[i] per element index i). Writes 128 bytes and advances BO by
 * 128; A0 and A1 are each advanced by 64 bytes.
 */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
addi A1, A1, 64
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
xxpermdi vs36, vs2, vs10, 0
xxpermdi vs37, vs2, vs10, 3
xxpermdi vs38, vs3, vs11, 0
xxpermdi vs39, vs3, vs11, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro COPY_2x4
/*
 * Interleave 4 doubles from each of the two columns A0 and A1 into BO
 * (A0[i], A1[i] per element index i). Writes 64 bytes and advances BO by
 * 64; A0 and A1 are each advanced by 32 bytes.
 */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
addi A1, A1, 32
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro COPY_2x2
/*
 * Interleave 2 doubles from each of the two columns A0 and A1 into BO as
 * A0[0], A1[0], A0[1], A1[1]. Writes 32 bytes and advances BO by 32;
 * A0 and A1 are each advanced by 16 bytes.
 */
lxvd2x vs0, o0, A0
addi A0, A0, 16
lxvd2x vs8, o0, A1
addi A1, A1, 16
/* xxpermdi ...,0 pairs doubleword 0 of both sources; ...,3 pairs doubleword 1. */
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
addi BO, BO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro COPY_2x1
/*
 * Copy one double from each of the two columns A0 and A1 into BO as
 * A0[0], A1[0]. Writes 16 bytes and advances BO by 16; A0 and A1 are each
 * advanced by 8 bytes.
 */
/* lxsdx loads a single double into doubleword 0 of the target register. */
lxsdx vs0, o0, A0
addi A0, A0, 8
lxsdx vs8, o0, A1
addi A1, A1, 8
/* Pair the two scalars into one vector for a single 16-byte store. */
xxpermdi vs32, vs0, vs8, 0
stxvd2x vs32, o0, BO
addi BO, BO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/
.macro COPY_1x16
/*
 * Straight copy of 16 doubles (128 bytes) from column A0 to BO; no
 * interleaving is needed for a single column. A0 and BO are each advanced
 * by 128 bytes (BO in two steps of 64).
 */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
lxvd2x vs4, o64, A0
lxvd2x vs5, o80, A0
lxvd2x vs6, o96, A0
lxvd2x vs7, o112, A0
addi A0, A0, 128
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
stxvd2x vs2, o32, BO
stxvd2x vs3, o48, BO
addi BO, BO, 64
stxvd2x vs4, o0, BO
stxvd2x vs5, o16, BO
stxvd2x vs6, o32, BO
stxvd2x vs7, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro COPY_1x8
/*
 * Straight copy of 8 doubles (64 bytes) from column A0 to BO.
 * A0 and BO are each advanced by 64 bytes.
 */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
stxvd2x vs2, o32, BO
stxvd2x vs3, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro COPY_1x4
/*
 * Straight copy of 4 doubles (32 bytes) from column A0 to BO.
 * A0 and BO are each advanced by 32 bytes.
 */
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
addi BO, BO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro COPY_1x2
/*
 * Straight copy of 2 doubles (16 bytes) from column A0 to BO.
 * A0 and BO are each advanced by 16 bytes.
 */
lxvd2x vs0, o0, A0
addi A0, A0, 16
stxvd2x vs0, o0, BO
addi BO, BO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro COPY_1x1
/*
 * Copy a single double (8 bytes) from column A0 to BO using the scalar
 * VSX load/store forms. A0 and BO are each advanced by 8 bytes.
 */
lxsdx vs0, o0, A0
addi A0, A0, 8
stxsdx vs0, o0, BO
addi BO, BO, 8
.endm

View File

@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B add B2, B2, B
add B1, B1, B add B1, B1, B
li PREA, 768 li PREA, 256
addi PREB, M16, 128 addi PREB, M16, 128
li o8, 8 li o8, 8

View File

@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:
DCOPYT_L4x16_LOOP: DCOPYT_L4x16_LOOP:
/*
addi T1, PREB, 128 addi T1, PREB, 128
addi T2, PREB, 256 addi T2, PREB, 256
*/
dcbt A0, PREA dcbt A0, PREA
dcbt A1, PREA dcbt A1, PREA
dcbt A2, PREA dcbt A2, PREA
dcbt A3, PREA dcbt A3, PREA
/*
dcbtst BO, M16 dcbtst BO, M16
dcbtst BO, PREB dcbtst BO, PREB
dcbtst BO, T1 dcbtst BO, T1
dcbtst BO, T2 dcbtst BO, T2
*/
COPY_4x16 COPY_4x16
add BO, BO, M16 add BO, BO, M16

View File

@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30 #define PRE r30
#define T2 r31 #define T2 r31
#include "dgemm_macros_16x4_power8.S" #include "dtrmm_macros_16x4_power8.S"
#ifndef NEEDPARAM #ifndef NEEDPARAM

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*
 * Register assignments for the 8-wide SGEMM "tcopy" packing routine.
 * The names below are used by the included macro and logic files.
 */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Incoming arguments (first five integer argument registers). */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7
/* Per-row source pointers, LDA apart. */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11
/* J: inner loop counter over groups of 8 columns. */
#define J r12
/* PREA: prefetch distance for dcbt (also used briefly as a mask temp).  */
/* PREB: set to M8 + 128 by the function; not used by the visible logic. */
#define PREA r14
#define PREB r15
/* BO: destination pointer handed to the COPY_* macros. */
#define BO r16
/* B8/B4/B2/B1: destination cursors for the 8-, 4-, 2- and 1-column      */
/* parts of the packed buffer.                                           */
#define B8 r17
#define B4 r18
#define B2 r19
#define B1 r20
/* o4..o48: registers holding constant byte offsets; presumably consumed */
/* by the COPY_* macros (macro file not visible here).                   */
#define o4 r21
#define T2 r22
/* I: outer loop counter over groups of 4 rows. */
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
#define NOTU1 r29
/* M8: byte distance between successive 8-column panels in B. */
#define M8 r30
/* T1: scratch result register for the remainder tests. */
#define T1 r31
/* o0 is the literal 0 (no index register). */
#define o0 0
#include "sgemm_tcopy_macros_8_power8.S"
/* Size of the local stack frame; only offsets 144..287 are used below. */
#define STACKSIZE 384
/*
 * Entry point of the 8-wide SGEMM tcopy routine: sets up the destination
 * cursors B4/B2/B1, the panel stride M8 and the constant offsets, then
 * runs the copy logic from sgemm_tcopy_logic_8_power8.S. Always returns 0
 * in r3.
 * NOTE(review): only r14-r31 are saved here. Whether f14-f31 or v20-v31
 * also need saving depends on the registers used by the COPY_* macros in
 * sgemm_tcopy_macros_8_power8.S, which is not visible here - confirm.
 */
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
/* r0 is not referenced by the visible logic. */
li r0, 0
/* Save the callee-saved GPRs. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* Nothing to copy when either dimension is <= 0. */
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999
/* Scale LDA by BASE_SHIFT (from common.h; presumably elements -> bytes). */
slwi LDA, LDA, BASE_SHIFT
/* M8 = M << (3 + BASE_SHIFT): stride between 8-column panels in B. */
slwi M8, M, 3 + BASE_SHIFT
/* Masks: T2 = ~7, PREA = ~3, PREB = ~1 (PREA/PREB are temporaries here). */
li T2, -8
li PREA, -4
li PREB, -2
/* Round N down to a multiple of 8, 4 and 2 respectively. */
and B4, N, T2
and B2, N, PREA
and B1, N, PREB
/* Multiply by M and scale by BASE_SHIFT to get offsets into B ... */
mullw B4, B4, M
mullw B2, B2, M
mullw B1, B1, M
slwi B4, B4, BASE_SHIFT
slwi B2, B2, BASE_SHIFT
slwi B1, B1, BASE_SHIFT
/* ... so B4/B2/B1 point where the 4-, 2- and 1-column remainders go. */
add B4, B4, B
add B2, B2, B
add B1, B1, B
/* Prefetch distances. */
li PREA, 384
addi PREB, M8, 128
/* Constant byte offsets held in registers. */
li o4, 4
li o16, 16
li o32, 32
li o48, 48
#include "sgemm_tcopy_logic_8_power8.S"
L999:
/* Common exit: return 0 and restore the saved GPRs. */
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

View File

@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * Groups of four source rows. I = M >> 2 counts the groups. Within a
 * group, columns are handled 8 at a time, then the 4-, 2- and 1-column
 * remainders according to the bits of N. The COPY_* macros come from
 * sgemm_tcopy_macros_8_power8.S; the source pointers are advanced here
 * after each macro call.
 */
srawi. I, M, 2
ble SCOPYOT_L2_BEGIN
SCOPYOT_L4_BEGIN:
/* A0..A3 point at four consecutive rows, LDA apart; A moves past them. */
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
/* B8 = start of this group's 8-column output; B advances by 4*8 elements. */
mr B8, B
addi B, B, 32*SIZE
/* J = N >> 3: number of full 8-column blocks. */
sradi. J, N, 3
ble SCOPYOT_L4x4_BEGIN
mr BO, B8
.align 5
SCOPYOT_L4x8_LOOP:
/* The loop body is unrolled four times; prefetches are issued once per pass. */
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
COPY_4x8
addi A0, A0, 8*SIZE
addi A1, A1, 8*SIZE
addi A2, A2, 8*SIZE
addi A3, A3, 8*SIZE
/* Step the destination to the same position in the next 8-column panel. */
add BO, BO, M8
addic. J, J, -1
ble SCOPYOT_L4x4_BEGIN
COPY_4x8
addi A0, A0, 8*SIZE
addi A1, A1, 8*SIZE
addi A2, A2, 8*SIZE
addi A3, A3, 8*SIZE
add BO, BO, M8
addic. J, J, -1
ble SCOPYOT_L4x4_BEGIN
COPY_4x8
addi A0, A0, 8*SIZE
addi A1, A1, 8*SIZE
addi A2, A2, 8*SIZE
addi A3, A3, 8*SIZE
add BO, BO, M8
addic. J, J, -1
ble SCOPYOT_L4x4_BEGIN
COPY_4x8
addi A0, A0, 8*SIZE
addi A1, A1, 8*SIZE
addi A2, A2, 8*SIZE
addi A3, A3, 8*SIZE
add BO, BO, M8
addic. J, J, -1
bgt SCOPYOT_L4x8_LOOP
SCOPYOT_L4x4_BEGIN:
/* 4-column remainder: output goes to the B4 cursor (4 rows x 4 columns). */
andi. T1, N, 4
ble SCOPYOT_L4x2_BEGIN
mr BO, B4
COPY_4x4
addi A0, A0, 4*SIZE
addi A1, A1, 4*SIZE
addi A2, A2, 4*SIZE
addi A3, A3, 4*SIZE
addi B4, B4, 16*SIZE
SCOPYOT_L4x2_BEGIN:
/* 2-column remainder: output goes to the B2 cursor (4 rows x 2 columns). */
andi. T1, N, 2
ble SCOPYOT_L4x1_BEGIN
mr BO, B2
COPY_4x2
addi A0, A0, 2*SIZE
addi A1, A1, 2*SIZE
addi A2, A2, 2*SIZE
addi A3, A3, 2*SIZE
addi B2, B2, 8*SIZE
SCOPYOT_L4x1_BEGIN:
/* 1-column remainder: output goes to the B1 cursor (4 rows x 1 column). */
andi. T1, N, 1
ble SCOPYOT_L4_END
mr BO, B1
COPY_4x1
addi A0, A0, 1*SIZE
addi A1, A1, 1*SIZE
addi A2, A2, 1*SIZE
addi A3, A3, 1*SIZE
addi B1, B1, 4*SIZE
SCOPYOT_L4_END:
/* Next group of four rows. */
addic. I, I, -1
bgt SCOPYOT_L4_BEGIN
SCOPYOT_L2_BEGIN:
/*
 * Two-row remainder: executed once when bit 1 of M is set. Same column
 * breakdown as the four-row case, using the two-row COPY_2x* macros.
 */
andi. T1, M, 2
ble SCOPYOT_L1_BEGIN
/* A0/A1 point at the two rows; A moves past them. */
mr A0, A
add A1, A0, LDA
add A, A1, LDA
/* B8 = start of this group's 8-column output; B advances by 2*8 elements. */
mr B8, B
addi B, B, 16*SIZE
/* J = N >> 3: number of full 8-column blocks. */
sradi. J, N, 3
ble SCOPYOT_L2x4_BEGIN
mr BO, B8
SCOPYOT_L2x8_LOOP:
COPY_2x8
addi A0, A0, 8*SIZE
addi A1, A1, 8*SIZE
/* Step the destination to the same position in the next 8-column panel. */
add BO, BO, M8
addic. J, J, -1
bgt SCOPYOT_L2x8_LOOP
SCOPYOT_L2x4_BEGIN:
/* 4-column remainder: output goes to the B4 cursor (2 rows x 4 columns). */
andi. T1, N, 4
ble SCOPYOT_L2x2_BEGIN
mr BO, B4
COPY_2x4
addi A0, A0, 4*SIZE
addi A1, A1, 4*SIZE
addi B4, B4, 8*SIZE
SCOPYOT_L2x2_BEGIN:
/* 2-column remainder: output goes to the B2 cursor (2 rows x 2 columns). */
andi. T1, N, 2
ble SCOPYOT_L2x1_BEGIN
mr BO, B2
COPY_2x2
addi A0, A0, 2*SIZE
addi A1, A1, 2*SIZE
addi B2, B2, 4*SIZE
SCOPYOT_L2x1_BEGIN:
/* 1-column remainder: output goes to the B1 cursor (2 rows x 1 column). */
andi. T1, N, 1
ble SCOPYOT_L2_END
mr BO, B1
COPY_2x1
addi A0, A0, 1*SIZE
addi A1, A1, 1*SIZE
addi B1, B1, 2*SIZE
SCOPYOT_L2_END:
SCOPYOT_L1_BEGIN:
/*
 * Single-row remainder: executed once when bit 0 of M is set; otherwise
 * control goes straight to the L999 exit label of the including file.
 */
andi. T1, M, 1
ble L999
/* A0 points at the last row; A moves past it. */
mr A0, A
add A, A0, LDA
/* B8 = start of this row's 8-column output; B advances by 8 elements. */
mr B8, B
addi B, B, 8*SIZE
/* J = N >> 3: number of full 8-column blocks. */
sradi. J, N, 3
ble SCOPYOT_L1x4_BEGIN
mr BO, B8
SCOPYOT_L1x8_LOOP:
COPY_1x8
addi A0, A0, 8*SIZE
/* Step the destination to the same position in the next 8-column panel. */
add BO, BO, M8
addic. J, J, -1
bgt SCOPYOT_L1x8_LOOP
SCOPYOT_L1x4_BEGIN:
/* 4-column remainder: output goes to the B4 cursor. */
andi. T1, N, 4
ble SCOPYOT_L1x2_BEGIN
mr BO, B4
COPY_1x4
addi A0, A0, 4*SIZE
addi B4, B4, 4*SIZE
SCOPYOT_L1x2_BEGIN:
/* 2-column remainder: output goes to the B2 cursor. */
andi. T1, N, 2
ble SCOPYOT_L1x1_BEGIN
mr BO, B2
COPY_1x2
addi A0, A0, 2*SIZE
addi B2, B2, 2*SIZE
SCOPYOT_L1x1_BEGIN:
/* 1-column remainder: output goes to the B1 cursor. */
andi. T1, N, 1
ble SCOPYOT_L1_END
mr BO, B1
COPY_1x1
addi A0, A0, 1*SIZE
addi B1, B1, 1*SIZE
SCOPYOT_L1_END:

View File

@ -0,0 +1,308 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
* COPY_4x8: copy 8 words from each of the 4 source pointers A0..A3 to BO
*
* Loads two 4-word vectors (lxvw4x) per source pointer into vs32-vs39, then
* stores them in the order A0, A1, A2, A3: four vectors at T1 = BO, and four
* more after T1 is advanced by 64 bytes.
* Clobbers vs32-vs39 and T1 (left at BO + 64).  A0-A3 and BO are not
* modified by the macro.
* NOTE(review): o0/o16/o32/o48 are index operands defined by the including
* file; presumably they hold byte offsets 0/16/32/48 - confirm.
**********************************************************************************************/
.macro COPY_4x8
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o0, A1
lxvw4x vs35, o16, A1
lxvw4x vs36, o0, A2
lxvw4x vs37, o16, A2
lxvw4x vs38, o0, A3
lxvw4x vs39, o16, A3
/* Store through a scratch copy so BO itself stays untouched. */
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
addi T1, T1, 64
stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
.endm
/**********************************************************************************************
* COPY_4x4: copy 4 words from each of the 4 source pointers A0..A3 to BO
*
* Loads one 4-word vector (lxvw4x) per source pointer into vs32-vs35 and
* stores them in the order A0, A1, A2, A3 at T1 = BO.
* Clobbers vs32-vs35 and T1 (left equal to BO).  A0-A3 and BO are not
* modified by the macro.
* NOTE(review): o0/o16/o32/o48 are index operands defined by the including
* file; presumably they hold byte offsets 0/16/32/48 - confirm.
**********************************************************************************************/
.macro COPY_4x4
lxvw4x vs32, o0, A0
lxvw4x vs33, o0, A1
lxvw4x vs34, o0, A2
lxvw4x vs35, o0, A3
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
.endm
/**********************************************************************************************
* COPY_4x2: copy 2 single-precision values from each of A0..A3 to BO
*
* Uses scalar loads/stores (lxsspx/stxsspx).  Each source pointer contributes
* a pair that is stored at T1+o0 and T1+o4; T1 starts at BO and is advanced
* by 8 bytes between pairs, in the order A0, A1, A2, A3.
* Clobbers vs32-vs39 and T1 (left at BO + 24).  A0-A3 and BO are not
* modified by the macro.
* NOTE(review): o0/o4 are index operands defined by the including file;
* presumably they hold byte offsets 0/4 - confirm.
**********************************************************************************************/
.macro COPY_4x2
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
lxsspx vs34, o0, A1
lxsspx vs35, o4, A1
lxsspx vs36, o0, A2
lxsspx vs37, o4, A2
lxsspx vs38, o0, A3
lxsspx vs39, o4, A3
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
addi T1, T1, 8
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
addi T1, T1, 8
stxsspx vs36, o0, T1
stxsspx vs37, o4, T1
addi T1, T1, 8
stxsspx vs38, o0, T1
stxsspx vs39, o4, T1
.endm
/**********************************************************************************************
* COPY_4x1: copy 1 single-precision value from each of A0..A3 to BO
*
* Uses scalar loads/stores (lxsspx/stxsspx).  The values from A0 and A1 are
* stored at T1+o0 and T1+o4 with T1 = BO; T1 is then advanced by 8 bytes and
* the values from A2 and A3 are stored the same way.
* Clobbers vs32-vs35 and T1 (left at BO + 8).  A0-A3 and BO are not modified
* by the macro.
* NOTE(review): o0/o4 are index operands defined by the including file;
* presumably they hold byte offsets 0/4 - confirm.
**********************************************************************************************/
.macro COPY_4x1
lxsspx vs32, o0, A0
lxsspx vs33, o0, A1
lxsspx vs34, o0, A2
lxsspx vs35, o0, A3
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
addi T1, T1, 8
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
.endm
/**********************************************************************************************
* COPY_2x8: copy 8 words from each of the 2 source pointers A0, A1 to BO
*
* Loads two 4-word vectors (lxvw4x) per source pointer into vs32-vs35 and
* stores them at T1 = BO in the order A0 (two vectors), A1 (two vectors).
* Clobbers vs32-vs35 and T1 (left equal to BO).  A0, A1 and BO are not
* modified by the macro.
* NOTE(review): o0/o16/o32/o48 are index operands defined by the including
* file; presumably they hold byte offsets 0/16/32/48 - confirm.
**********************************************************************************************/
.macro COPY_2x8
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o0, A1
lxvw4x vs35, o16, A1
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
.endm
/**********************************************************************************************
* COPY_2x4: copy 4 words from each of the 2 source pointers A0, A1 to BO
*
* Loads one 4-word vector (lxvw4x) per source pointer into vs32/vs33 and
* stores them at T1+o0 and T1+o16 with T1 = BO.
* Clobbers vs32, vs33 and T1 (left equal to BO).  A0, A1 and BO are not
* modified by the macro.
* NOTE(review): o0/o16 are index operands defined by the including file;
* presumably they hold byte offsets 0/16 - confirm.
**********************************************************************************************/
.macro COPY_2x4
lxvw4x vs32, o0, A0
lxvw4x vs33, o0, A1
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
.endm
/**********************************************************************************************
* COPY_2x2: copy 2 single-precision values from each of A0, A1 to BO
*
* Uses scalar loads/stores (lxsspx/stxsspx).  The pair from A0 is stored at
* T1+o0 and T1+o4 with T1 = BO; T1 is then advanced by 8 bytes and the pair
* from A1 is stored the same way.
* Clobbers vs32-vs35 and T1 (left at BO + 8).  A0, A1 and BO are not modified
* by the macro.
* NOTE(review): o0/o4 are index operands defined by the including file;
* presumably they hold byte offsets 0/4 - confirm.
**********************************************************************************************/
.macro COPY_2x2
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
lxsspx vs34, o0, A1
lxsspx vs35, o4, A1
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
addi T1, T1, 8
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
.endm
/**********************************************************************************************
* COPY_2x1: copy 1 single-precision value from each of A0, A1 to BO
*
* Uses scalar loads/stores (lxsspx/stxsspx).  The value from A0 is stored at
* T1+o0 and the value from A1 at T1+o4, with T1 = BO.
* Clobbers vs32, vs33 and T1 (left equal to BO).  A0, A1 and BO are not
* modified by the macro.
* NOTE(review): o0/o4 are index operands defined by the including file;
* presumably they hold byte offsets 0/4 - confirm.
**********************************************************************************************/
.macro COPY_2x1
lxsspx vs32, o0, A0
lxsspx vs33, o0, A1
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
.endm
/**********************************************************************************************
* COPY_1x8: copy 8 words from the single source pointer A0 to BO
*
* Loads two 4-word vectors (lxvw4x) from A0 into vs32/vs33 and stores them at
* T1+o0 and T1+o16 with T1 = BO.
* Clobbers vs32, vs33 and T1 (left equal to BO).  A0 and BO are not modified
* by the macro.
* NOTE(review): o0/o16 are index operands defined by the including file;
* presumably they hold byte offsets 0/16 - confirm.
**********************************************************************************************/
.macro COPY_1x8
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
mr T1, BO
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
.endm
/**********************************************************************************************
* COPY_1x4: copy 4 words from the single source pointer A0 to BO
*
* Loads one 4-word vector (lxvw4x) from A0 into vs32 and stores it at
* T1 = BO.
* Clobbers vs32 and T1 (left equal to BO).  A0 and BO are not modified by
* the macro.
**********************************************************************************************/
.macro COPY_1x4
lxvw4x vs32, o0, A0
mr T1, BO
stxvw4x vs32, o0, T1
.endm
/**********************************************************************************************
* COPY_1x2: copy 2 single-precision values from the source pointer A0 to BO
*
* Uses scalar loads/stores (lxsspx/stxsspx): reads A0+o0 and A0+o4 and writes
* them to T1+o0 and T1+o4 with T1 = BO.
* Clobbers vs32, vs33 and T1 (left equal to BO).  A0 and BO are not modified
* by the macro.
* NOTE(review): o0/o4 are index operands defined by the including file;
* presumably they hold byte offsets 0/4 - confirm.
**********************************************************************************************/
.macro COPY_1x2
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
mr T1, BO
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
.endm
/**********************************************************************************************
* COPY_1x1: copy 1 single-precision value from the source pointer A0 to BO
*
* Uses a scalar load/store pair (lxsspx/stxsspx) through vs32, writing at
* T1 = BO.
* Clobbers vs32 and T1 (left equal to BO).  A0 and BO are not modified by
* the macro.
**********************************************************************************************/
.macro COPY_1x1
lxsspx vs32, o0, A0
mr T1, BO
stxsspx vs32, o0, T1
.endm

View File

@ -1,3 +1,73 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -250,7 +320,7 @@
ble L999 ble L999
slwi LDC, LDC, ZBASE_SHIFT slwi LDC, LDC, ZBASE_SHIFT
li PRE, 384 li PRE, 512
li o8 , 8 li o8 , 8
li o16 , 16 li o16 , 16
li o24 , 24 li o24 , 24

View File

@ -1,3 +1,39 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
srawi. J, N, 1 srawi. J, N, 1
ble ZGEMM_L2_END ble ZGEMM_L2_END
@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN:
mr BO, B mr BO, B
mr BBO, BBUFFER mr BBO, BBUFFER
slwi T1, K, 1 srawi. T1, K, 2
ble ZGEMM_L2_COPYB1
ZGEMM_L2_COPYB: ZGEMM_L2_COPYB8:
lxvdsx vs4, o0, BO // b0_r addi T2, PRE, 128
lxvdsx vs5, o8, BO // b0_i dcbt BO, PRE
addi BO, BO, 16 dcbtst BBO, PRE
stxvd2x vs4, o0, BBO dcbtst BBO, T2
stxvd2x vs5, o16, BBO ZCOPYB_8x1
addic. T1, T1, -1 addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L2_COPYB bgt ZGEMM_L2_COPYB8
ZGEMM_L2_COPYB1:
andi. T1, K, 3
ble ZGEMM_L2_COPYB_END
ZGEMM_L2_COPYB_LOOP:
ZCOPYB_1x1
ZCOPYB_1x1
addic. T1, T1, -1
bgt ZGEMM_L2_COPYB_LOOP
ZGEMM_L2_COPYB_END:
mr CO, C mr CO, C
mr AO, A mr AO, A
@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN:
slwi T1, K, 0 slwi T1, K, 0
ZGEMM_L1_COPYB: ZGEMM_L1_COPYB:
dcbtst BBO, PRE
lxvdsx vs4, o0, BO // b0_r lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i lxvdsx vs5, o8, BO // b0_i

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp #define XSFADD_R1 xsadddp
@ -3055,3 +3090,76 @@
.endm .endm
/*
 * ZCOPYB_1x1: expand one 16-byte item at BO into 32 bytes at BBO.
 * lxvdsx loads one doubleword and replicates it into both halves of the
 * vector, so the doubleword at BO+o0 (b0_r) fills the vector stored at
 * BBO+o0 and the doubleword at BO+o8 (b0_i) fills the vector stored at
 * BBO+o16.  Advances BO by 16 and BBO by 32; clobbers vs4 and vs5.
 * NOTE(review): o0/o8/o16 are index operands defined by the including file;
 * presumably they hold byte offsets 0/8/16 - confirm.
 */
.macro ZCOPYB_1x1
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi BBO, BBO, 32
.endm
/*
 * ZCOPYB_8x1: expand eight consecutive 16-byte items at BO into 256 bytes at
 * BBO.  Each item is loaded as a two-doubleword vector (lxvd2x); doubleword 0
 * and doubleword 1 are then each replicated into a full vector (xxspltd) and
 * the two resulting vectors are stored one after the other.  This is the
 * same expansion ZCOPYB_1x1 performs per item, done eight items at a time.
 * Advances BO by 128 and BBO by 256; clobbers vs32-vs55.
 * NOTE(review): o0/o16/o32/o48 are index operands defined by the including
 * file; presumably they hold byte offsets 0/16/32/48 - confirm.
 */
.macro ZCOPYB_8x1
/* Load 8 vectors (vs32-vs39) in two groups of 64 bytes. */
lxvd2x vs32, o0, BO
lxvd2x vs33, o16, BO
lxvd2x vs34, o32, BO
lxvd2x vs35, o48, BO
addi BO, BO, 64
lxvd2x vs36, o0, BO
lxvd2x vs37, o16, BO
lxvd2x vs38, o32, BO
lxvd2x vs39, o48, BO
addi BO, BO, 64
/* Splat: vs(40+2k) = doubleword 0 of vs(32+k), vs(41+2k) = doubleword 1. */
xxspltd vs40, vs32, 0
xxspltd vs41, vs32, 1
xxspltd vs42, vs33, 0
xxspltd vs43, vs33, 1
xxspltd vs44, vs34, 0
xxspltd vs45, vs34, 1
xxspltd vs46, vs35, 0
xxspltd vs47, vs35, 1
xxspltd vs48, vs36, 0
xxspltd vs49, vs36, 1
xxspltd vs50, vs37, 0
xxspltd vs51, vs37, 1
xxspltd vs52, vs38, 0
xxspltd vs53, vs38, 1
xxspltd vs54, vs39, 0
xxspltd vs55, vs39, 1
/* Store the 16 splatted vectors (vs40-vs55) in four groups of 64 bytes. */
stxvd2x vs40, o0, BBO
stxvd2x vs41, o16, BBO
stxvd2x vs42, o32, BBO
stxvd2x vs43, o48, BBO
addi BBO, BBO, 64
stxvd2x vs44, o0, BBO
stxvd2x vs45, o16, BBO
stxvd2x vs46, o32, BBO
stxvd2x vs47, o48, BBO
addi BBO, BBO, 64
stxvd2x vs48, o0, BBO
stxvd2x vs49, o16, BBO
stxvd2x vs50, o32, BBO
stxvd2x vs51, o48, BBO
addi BBO, BBO, 64
stxvd2x vs52, o0, BBO
stxvd2x vs53, o16, BBO
stxvd2x vs54, o32, BBO
stxvd2x vs55, o48, BBO
addi BBO, BBO, 64
.endm

View File

@ -0,0 +1,205 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/*
 * Register aliases used by this routine and by the files it includes.
 * NOTE(review): M, N, A, LDA, B occupy r3-r7, presumably the incoming
 * arguments in ABI argument-register order - confirm against the caller.
 */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7
/* Scratch pointer/counter registers (r8-r12). */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11
#define J r12
/* PREA/PREB first hold the masks -4/-2, later the prefetch offsets. */
#define PREA r14
#define PREB r15
/* Destination cursors: BO is the working pointer, B8/B4/B2/B1 are bases. */
#define BO r16
#define B8 r17
#define B4 r18
#define B2 r19
#define B1 r20
/* NOTUS1/NOTUS2 are not referenced in the code visible in this file. */
#define NOTUS1 r21
#define T2 r22
#define I r23
/* o16/o32/o48 are loaded with the constants 16/32/48 before use. */
#define o16 r24
#define o32 r25
#define o48 r26
#define NOTUS2 r27
#define M8 r30
#define T1 r31
/* o0 is the literal 0, i.e. "no index register" in indexed load/store forms. */
#define o0 0
#include "zgemm_tcopy_macros_8_power8.S"
/* Bytes reserved on the stack; r14-r31 are saved at offsets 144..280. */
#define STACKSIZE 384
/*
 * Routine entry (the symbol name comes from the PROLOGUE macro, which is
 * defined outside this file).
 *
 * Steps:
 *   1. Reserve STACKSIZE bytes and save r14-r31.
 *   2. Return immediately (via L999) when M <= 0 or N <= 0.
 *   3. Scale LDA by ZBASE_SHIFT and compute M8 = M << (3 + ZBASE_SHIFT).
 *   4. Compute the destination bases
 *        B4 = B + (((N & -8) * M) << ZBASE_SHIFT)
 *        B2 = B + (((N & -4) * M) << ZBASE_SHIFT)
 *        B1 = B + (((N & -2) * M) << ZBASE_SHIFT)
 *   5. Load the prefetch offsets and the index constants, then run the
 *      included copy logic.
 *   6. L999: return 0 in r3 after restoring r14-r31 and the stack pointer.
 *
 * NOTE(review): cmpwi/slwi/mullw work on 32-bit quantities, and slwi leaves
 * the upper 32 bits of the result zero, so the byte offsets above are
 * limited to 32 bits.  Presumably callers only pass blocked panel sizes
 * that stay well below that - confirm.
 */
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
/* Save r14-r31 (r31 at 144(SP) ... r14 at 280(SP)). */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* Nothing to copy when either dimension is non-positive. */
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999
slwi LDA, LDA, ZBASE_SHIFT
slwi M8, M, 3 + ZBASE_SHIFT
/* T2/PREA/PREB temporarily hold the masks -8/-4/-2. */
li T2, -8
li PREA, -4
li PREB, -2
/* Round N down to a multiple of 8, 4 and 2 respectively. */
and B4, N, T2
and B2, N, PREA
and B1, N, PREB
mullw B4, B4, M
mullw B2, B2, M
mullw B1, B1, M
slwi B4, B4, ZBASE_SHIFT
slwi B2, B2, ZBASE_SHIFT
slwi B1, B1, ZBASE_SHIFT
add B4, B4, B
add B2, B2, B
add B1, B1, B
/* From here on PREA/PREB are prefetch offsets used with dcbt/dcbtst. */
li PREA, 384
addi PREB, M8, 128
li o16, 16
li o32, 32
li o48, 48
#include "zgemm_tcopy_logic_8_power8.S"
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

View File

@ -0,0 +1,246 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * ZCOPYT_L4: main loop over groups of four source lines (I = M >> 2 groups).
 * Each pass points A0..A3 at four lines LDA apart, advances A past them and
 * consumes the lines in chunks of 8, 4, 2 and 1 elements selected from N.
 * Every chunk width writes through its own destination cursor (BO from B8,
 * then B4, B2, B1).
 * NOTE(review): A0..A3 are not advanced in this section; presumably the
 * COPY_4xN macros advance them - confirm.  SIZE is defined outside this
 * section.
 */
srawi. I, M, 2
ble ZCOPYT_L2_BEGIN
ZCOPYT_L4_BEGIN:
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
/* Remember the current B for the 8-wide chunks, then advance B by 64*SIZE. */
mr B8, B
addi B, B, 64*SIZE
/* J = N >> 3 (number of 8-wide chunks); skip the loop when J <= 0. */
sradi. J, N, 3
ble ZCOPYT_L4x4_BEGIN
mr BO, B8
/* Align the loop head to a 2^5-byte boundary. */
.align 5
ZCOPYT_L4x8_LOOP:
/* Prefetch offsets are rebuilt on every pass; presumably because the copy
   macro uses T1 as scratch - confirm. */
addi T1, PREB, 128
addi T2, PREB, 256
/* Touch the four source lines PREA bytes ahead (load hint). */
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
/* Touch the destination at BO+M8, BO+PREB, BO+T1 and BO+T2 (store hint). */
dcbtst BO, M8
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
COPY_4x8
/* Step the destination by M8 for the next 8-wide chunk. */
add BO, BO, M8
addic. J, J, -1
bgt ZCOPYT_L4x8_LOOP
ZCOPYT_L4x4_BEGIN:
/* 4-wide remainder: taken when bit 2 of N is set; writes at B4. */
andi. T1, N, 4
ble ZCOPYT_L4x2_BEGIN
mr BO, B4
COPY_4x4
addi B4, B4, 32*SIZE
ZCOPYT_L4x2_BEGIN:
/* 2-wide remainder: taken when bit 1 of N is set; writes at B2. */
andi. T1, N, 2
ble ZCOPYT_L4x1_BEGIN
mr BO, B2
COPY_4x2
addi B2, B2, 16*SIZE
ZCOPYT_L4x1_BEGIN:
/* 1-wide remainder: taken when bit 0 of N is set; writes at B1. */
andi. T1, N, 1
ble ZCOPYT_L4_END
mr BO, B1
COPY_4x1
addi B1, B1, 8*SIZE
ZCOPYT_L4_END:
/* Next group of four lines while I > 0. */
addic. I, I, -1
bgt ZCOPYT_L4_BEGIN
/*
 * ZCOPYT_L2: copies one pair of source lines; executed only when bit 1 of M
 * is set.  A0 and A1 are set to two lines LDA apart and A is advanced past
 * both.  The pair is consumed in chunks of 8, 4, 2 and 1 elements selected
 * from N; each chunk width writes through its own destination cursor
 * (BO from B8, then B4, B2, B1).
 * NOTE(review): A0/A1 are not advanced in this section; presumably the
 * COPY_2xN macros advance them - confirm.  SIZE is defined outside this
 * section.
 */
ZCOPYT_L2_BEGIN:
/* andi. records (M & 2) in CR0; ble is taken when the result is zero. */
andi. T1, M, 2
ble ZCOPYT_L1_BEGIN
mr A0, A
add A1, A0, LDA
add A, A1, LDA
/* Remember the current B for the 8-wide chunks, then advance B by 32*SIZE. */
mr B8, B
addi B, B, 32*SIZE
/* J = N >> 3 (number of 8-wide chunks); skip the loop when J <= 0. */
sradi. J, N, 3
ble ZCOPYT_L2x4_BEGIN
mr BO, B8
ZCOPYT_L2x8_LOOP:
COPY_2x8
/* Step the destination by M8 for the next 8-wide chunk. */
add BO, BO, M8
addic. J, J, -1
bgt ZCOPYT_L2x8_LOOP
ZCOPYT_L2x4_BEGIN:
/* 4-wide remainder: taken when bit 2 of N is set; writes at B4. */
andi. T1, N, 4
ble ZCOPYT_L2x2_BEGIN
mr BO, B4
COPY_2x4
addi B4, B4, 16*SIZE
ZCOPYT_L2x2_BEGIN:
/* 2-wide remainder: taken when bit 1 of N is set; writes at B2. */
andi. T1, N, 2
ble ZCOPYT_L2x1_BEGIN
mr BO, B2
COPY_2x2
addi B2, B2, 8*SIZE
ZCOPYT_L2x1_BEGIN:
/* 1-wide remainder: taken when bit 0 of N is set; writes at B1. */
andi. T1, N, 1
ble ZCOPYT_L2_END
mr BO, B1
COPY_2x1
addi B1, B1, 4*SIZE
/* Falls through into the single-line section that follows. */
ZCOPYT_L2_END:
/*
 * ZCOPYT_L1: copies one final source line; executed only when bit 0 of M is
 * set, otherwise control leaves through L999.  A0 points at the line and A
 * is advanced by LDA.  The line is consumed in chunks of 8, 4, 2 and 1
 * elements selected from N; each chunk width writes through its own
 * destination cursor (BO from B8, then B4, B2, B1).
 * NOTE(review): A0 is not advanced in this section; presumably the COPY_1xN
 * macros advance it - confirm.  L999 and SIZE are defined outside this
 * section.
 */
ZCOPYT_L1_BEGIN:
/* andi. records (M & 1) in CR0; ble is taken when the result is zero. */
andi. T1, M, 1
ble L999
mr A0, A
add A, A0, LDA
/* Remember the current B for the 8-wide chunks, then advance B by 16*SIZE. */
mr B8, B
addi B, B, 16*SIZE
/* J = N >> 3 (number of 8-wide chunks); skip the loop when J <= 0. */
sradi. J, N, 3
ble ZCOPYT_L1x4_BEGIN
mr BO, B8
ZCOPYT_L1x8_LOOP:
COPY_1x8
/* Step the destination by M8 for the next 8-wide chunk. */
add BO, BO, M8
addic. J, J, -1
bgt ZCOPYT_L1x8_LOOP
ZCOPYT_L1x4_BEGIN:
/* 4-wide remainder: taken when bit 2 of N is set; writes at B4. */
andi. T1, N, 4
ble ZCOPYT_L1x2_BEGIN
mr BO, B4
COPY_1x4
addi B4, B4, 8*SIZE
ZCOPYT_L1x2_BEGIN:
/* 2-wide remainder: taken when bit 1 of N is set; writes at B2. */
andi. T1, N, 2
ble ZCOPYT_L1x1_BEGIN
mr BO, B2
COPY_1x2
addi B2, B2, 4*SIZE
ZCOPYT_L1x1_BEGIN:
/* 1-wide remainder: taken when bit 0 of N is set; writes at B1. */
andi. T1, N, 1
ble ZCOPYT_L1_END
mr BO, B1
COPY_1x1
addi B1, B1, 2*SIZE
ZCOPYT_L1_END:

View File

@ -0,0 +1,535 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
/*
 * COPY_4x8: load eight 16-byte vectors from each of A0, A1, A2 and A3 and
 * store all 32 vectors back to back starting at BO, in the order A0, A1,
 * A2, A3.
 *
 * Each lxvd2x/stxvd2x moves one 16-byte VSX register; presumably that is one
 * double-precision complex element - TODO confirm.
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads 128 bytes per source pointer and writes 512 bytes.
 *
 * On exit: A0..A3 are each advanced by 128 bytes; BO is not modified;
 * T1 and vs32..vs63 are overwritten.
 */
.macro COPY_4x8
/* A0: 2 x 64 bytes into vs32..vs39. */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
/* A1: 2 x 64 bytes into vs40..vs47. */
lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1
addi A1, A1, 64
lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64
/* A2: 2 x 64 bytes into vs48..vs55. */
lxvd2x vs48, o0, A2
lxvd2x vs49, o16, A2
lxvd2x vs50, o32, A2
lxvd2x vs51, o48, A2
addi A2, A2, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
addi A2, A2, 64
/* A3: 2 x 64 bytes into vs56..vs63. */
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
addi A3, A3, 64
lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
addi A3, A3, 64
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
/* Data loaded from A0. */
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
addi T1, T1, 64
/* Data loaded from A1. */
stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64
stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
addi T1, T1, 64
/* Data loaded from A2. */
stxvd2x vs48, o0, T1
stxvd2x vs49, o16, T1
stxvd2x vs50, o32, T1
stxvd2x vs51, o48, T1
addi T1, T1, 64
stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1
addi T1, T1, 64
/* Data loaded from A3. */
stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
addi T1, T1, 64
stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1
.endm
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
/*
 * COPY_4x4: load four 16-byte vectors from each of A0, A1, A2 and A3 and
 * store all 16 vectors back to back starting at BO, in the order A0, A1,
 * A2, A3.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads 64 bytes per source pointer and writes 256 bytes.
 *
 * On exit: A0..A3 are each advanced by 64 bytes; BO is not modified;
 * T1 and vs32..vs47 are overwritten.
 */
.macro COPY_4x4
/* A0 -> vs32..vs35. */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
/* A1 -> vs36..vs39. */
lxvd2x vs36, o0, A1
lxvd2x vs37, o16, A1
lxvd2x vs38, o32, A1
lxvd2x vs39, o48, A1
addi A1, A1, 64
/* A2 -> vs40..vs43. */
lxvd2x vs40, o0, A2
lxvd2x vs41, o16, A2
lxvd2x vs42, o32, A2
lxvd2x vs43, o48, A2
addi A2, A2, 64
/* A3 -> vs44..vs47. */
lxvd2x vs44, o0, A3
lxvd2x vs45, o16, A3
lxvd2x vs46, o32, A3
lxvd2x vs47, o48, A3
addi A3, A3, 64
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
addi T1, T1, 64
stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64
stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
.endm
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
/*
 * COPY_4x2: load two 16-byte vectors from each of A0, A1, A2 and A3 and
 * store all eight vectors back to back starting at BO, in the order A0, A1,
 * A2, A3.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads 32 bytes per source pointer and writes 128 bytes.
 *
 * On exit: A0..A3 are each advanced by 32 bytes; BO is not modified;
 * T1 and vs32..vs39 are overwritten.
 */
.macro COPY_4x2
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32
lxvd2x vs34, o0, A1
lxvd2x vs35, o16, A1
addi A1, A1, 32
lxvd2x vs36, o0, A2
lxvd2x vs37, o16, A2
addi A2, A2, 32
lxvd2x vs38, o0, A3
lxvd2x vs39, o16, A3
addi A3, A3, 32
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
/* First 64 bytes: the A0 pair followed by the A1 pair. */
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
/* Second 64 bytes: the A2 pair followed by the A3 pair. */
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
.endm
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
/*
 * COPY_4x1: load one 16-byte vector from each of A0, A1, A2 and A3 and store
 * the four vectors back to back starting at BO, in the order A0, A1, A2, A3.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro writes 64 bytes.
 *
 * On exit: A0..A3 are each advanced by 16 bytes; BO is not modified;
 * T1 and vs32..vs35 are overwritten.
 */
.macro COPY_4x1
lxvd2x vs32, o0, A0
addi A0, A0, 16
lxvd2x vs33, o0, A1
addi A1, A1, 16
lxvd2x vs34, o0, A2
addi A2, A2, 16
lxvd2x vs35, o0, A3
addi A3, A3, 16
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
/*
 * COPY_2x8: load eight 16-byte vectors from each of A0 and A1 and store all
 * 16 vectors back to back starting at BO, A0 data first, then A1 data.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads 128 bytes per source pointer and writes 256 bytes.
 *
 * On exit: A0 and A1 are each advanced by 128 bytes; BO is not modified;
 * T1 and vs32..vs47 are overwritten.
 */
.macro COPY_2x8
/* A0: 2 x 64 bytes into vs32..vs39. */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
/* A1: 2 x 64 bytes into vs40..vs47. */
lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1
addi A1, A1, 64
lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
addi T1, T1, 64
stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64
stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
/*
 * COPY_2x4: load four 16-byte vectors from each of A0 and A1 and store all
 * eight vectors back to back starting at BO, A0 data first, then A1 data.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads 64 bytes per source pointer and writes 128 bytes.
 *
 * On exit: A0 and A1 are each advanced by 64 bytes; BO is not modified;
 * T1 and vs32..vs39 are overwritten.
 */
.macro COPY_2x4
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A1
lxvd2x vs37, o16, A1
lxvd2x vs38, o32, A1
lxvd2x vs39, o48, A1
addi A1, A1, 64
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
/*
 * COPY_2x2: load two 16-byte vectors from each of A0 and A1 and store the
 * four vectors back to back starting at BO, A0 pair first, then A1 pair.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads 32 bytes per source pointer and writes 64 bytes.
 *
 * On exit: A0 and A1 are each advanced by 32 bytes; BO is not modified;
 * T1 and vs32..vs35 are overwritten.
 */
.macro COPY_2x2
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32
lxvd2x vs34, o0, A1
lxvd2x vs35, o16, A1
addi A1, A1, 32
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
/*
 * COPY_2x1: load one 16-byte vector from A0 and one from A1 and store them
 * back to back starting at BO (A0 data first).
 *
 * o0/o16 are index registers defined outside this file and are assumed to
 * hold the byte offsets 0/16 - TODO confirm. With those offsets the macro
 * writes 32 bytes.
 *
 * On exit: A0 and A1 are each advanced by 16 bytes; BO is not modified;
 * T1, vs32 and vs33 are overwritten.
 */
.macro COPY_2x1
lxvd2x vs32, o0, A0
addi A0, A0, 16
lxvd2x vs33, o0, A1
addi A1, A1, 16
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
/*
 * COPY_1x8: load eight 16-byte vectors from A0 and store them back to back
 * starting at BO.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads and writes 128 bytes.
 *
 * On exit: A0 is advanced by 128 bytes; BO is not modified;
 * T1 and vs32..vs39 are overwritten.
 */
.macro COPY_1x8
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
/*
 * COPY_1x4: load four 16-byte vectors from A0 and store them back to back
 * starting at BO.
 *
 * o0/o16/o32/o48 are index registers defined outside this file and are
 * assumed to hold the byte offsets 0/16/32/48 - TODO confirm. With those
 * offsets the macro reads and writes 64 bytes.
 *
 * On exit: A0 is advanced by 64 bytes; BO is not modified;
 * T1 and vs32..vs35 are overwritten.
 */
.macro COPY_1x4
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
/*
 * COPY_1x2: load two 16-byte vectors from A0 and store them back to back
 * starting at BO.
 *
 * o0/o16 are index registers defined outside this file and are assumed to
 * hold the byte offsets 0/16 - TODO confirm. With those offsets the macro
 * reads and writes 32 bytes.
 *
 * On exit: A0 is advanced by 32 bytes; BO is not modified;
 * T1, vs32 and vs33 are overwritten.
 */
.macro COPY_1x2
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
/*
 * COPY_1x1: load one 16-byte vector from A0 and store it at BO.
 *
 * o0 is an index register defined outside this file and is assumed to hold
 * the byte offset 0 - TODO confirm.
 *
 * On exit: A0 is advanced by 16 bytes; BO is not modified;
 * T1 and vs32 are overwritten.
 */
.macro COPY_1x1
lxvd2x vs32, o0, A0
addi A0, A0, 16
/* Store through a scratch copy of BO so that BO itself stays untouched. */
mr T1, BO
stxvd2x vs32, o0, T1
.endm

View File

@ -933,6 +933,23 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
#ifdef EXCAVATOR
#ifdef DEBUG
fprintf(stderr, "Excavator\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef PILEDRIVER #ifdef PILEDRIVER
#ifdef DEBUG #ifdef DEBUG

View File

@ -1,3 +1,7 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SAXPYKERNEL = saxpy.c SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c CAXPYKERNEL = caxpy.c
@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c DGEMVTKERNEL = dgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S
@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c
DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c
ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c
ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c
ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "caxpy_microk_steamroller-2.c" #include "caxpy_microk_steamroller-2.c"
#elif defined(BULLDOZER) #elif defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c" #include "caxpy_microk_bulldozer-2.c"

View File

@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) #if defined(BULLDOZER)
#include "cdot_microk_bulldozer-2.c" #include "cdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER) #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "cdot_microk_steamroller-2.c" #include "cdot_microk_steamroller-2.c"
#elif defined(HASWELL) #elif defined(HASWELL)
#include "cdot_microk_haswell-2.c" #include "cdot_microk_haswell-2.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL) #if defined(HASWELL)
#include "cgemv_n_microk_haswell-4.c" #include "cgemv_n_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_n_microk_bulldozer-4.c" #include "cgemv_n_microk_bulldozer-4.c"
#endif #endif

View File

@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL) #if defined(HASWELL)
#include "cgemv_t_microk_haswell-4.c" #include "cgemv_t_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_t_microk_bulldozer-4.c" #include "cgemv_t_microk_bulldozer-4.c"
#endif #endif

View File

@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cscal_microk_haswell-2.c" #include "cscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) #elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "cscal_microk_bulldozer-2.c" #include "cscal_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) #elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cscal_microk_steamroller-2.c" #include "cscal_microk_steamroller-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "cscal_microk_bulldozer-2.c" #include "cscal_microk_bulldozer-2.c"

View File

@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_nehalem-2.c" #include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER) #elif defined(BULLDOZER)
#include "daxpy_microk_bulldozer-2.c" #include "daxpy_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) #elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "daxpy_microk_steamroller-2.c" #include "daxpy_microk_steamroller-2.c"
#elif defined(PILEDRIVER) #elif defined(PILEDRIVER)
#include "daxpy_microk_piledriver-2.c" #include "daxpy_microk_piledriver-2.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) #if defined(BULLDOZER)
#include "ddot_microk_bulldozer-2.c" #include "ddot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) #elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ddot_microk_steamroller-2.c" #include "ddot_microk_steamroller-2.c"
#elif defined(PILEDRIVER) #elif defined(PILEDRIVER)
#include "ddot_microk_piledriver-2.c" #include "ddot_microk_piledriver-2.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM) #if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c" #include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL) || defined(STEAMROLLER) #elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_n_microk_haswell-4.c" #include "dgemv_n_microk_haswell-4.c"
#endif #endif

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(HASWELL) || defined(STEAMROLLER) #if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_t_microk_haswell-4.c" #include "dgemv_t_microk_haswell-4.c"
#endif #endif

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dscal_microk_bulldozer-2.c" #include "dscal_microk_bulldozer-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "dscal_microk_sandy-2.c" #include "dscal_microk_sandy-2.c"

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_L_microk_bulldozer-2.c" #include "dsymv_L_microk_bulldozer-2.c"
#elif defined(HASWELL) #elif defined(HASWELL)
#include "dsymv_L_microk_haswell-2.c" #include "dsymv_L_microk_haswell-2.c"

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_U_microk_bulldozer-2.c" #include "dsymv_U_microk_bulldozer-2.c"
#elif defined(HASWELL) #elif defined(HASWELL)
#include "dsymv_U_microk_haswell-2.c" #include "dsymv_U_microk_haswell-2.c"

View File

@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "saxpy_microk_haswell-2.c" #include "saxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c" #include "saxpy_microk_sandy-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "saxpy_microk_piledriver-2.c" #include "saxpy_microk_piledriver-2.c"
#endif #endif

View File

@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) #if defined(BULLDOZER)
#include "sdot_microk_bulldozer-2.c" #include "sdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER) #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "sdot_microk_steamroller-2.c" #include "sdot_microk_steamroller-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c" #include "sdot_microk_nehalem-2.c"

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "sgemv_n_microk_bulldozer-4.c" #include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c" #include "sgemv_n_microk_nehalem-4.c"
@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_n_microk_haswell-4.c" #include "sgemv_n_microk_haswell-4.c"
#endif #endif
#if defined(STEAMROLLER) #if defined(STEAMROLLER) || defined(EXCAVATOR)
#define NBMAX 2048 #define NBMAX 2048
#else #else
#define NBMAX 4096 #define NBMAX 4096

View File

@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM) #if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c" #include "sgemv_t_microk_nehalem-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "sgemv_t_microk_bulldozer-4.c" #include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c" #include "sgemv_t_microk_sandy-4.c"
@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_t_microk_haswell-4.c" #include "sgemv_t_microk_haswell-4.c"
#endif #endif
#if defined(STEAMROLLER) #if defined(STEAMROLLER) || defined(EXCAVATOR)
#define NBMAX 2048 #define NBMAX 2048
#else #else
#define NBMAX 4096 #define NBMAX 4096

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ssymv_L_microk_bulldozer-2.c" #include "ssymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "ssymv_L_microk_nehalem-2.c" #include "ssymv_L_microk_nehalem-2.c"

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ssymv_U_microk_bulldozer-2.c" #include "ssymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM) #elif defined(NEHALEM)
#include "ssymv_U_microk_nehalem-2.c" #include "ssymv_U_microk_nehalem-2.c"

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) #if defined(BULLDOZER)
#include "zaxpy_microk_bulldozer-2.c" #include "zaxpy_microk_bulldozer-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zaxpy_microk_steamroller-2.c" #include "zaxpy_microk_steamroller-2.c"
#elif defined(HASWELL) #elif defined(HASWELL)
#include "zaxpy_microk_haswell-2.c" #include "zaxpy_microk_haswell-2.c"

View File

@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER) #if defined(BULLDOZER)
#include "zdot_microk_bulldozer-2.c" #include "zdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER) #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "zdot_microk_steamroller-2.c" #include "zdot_microk_steamroller-2.c"
#elif defined(HASWELL) #elif defined(HASWELL)
#include "zdot_microk_haswell-2.c" #include "zdot_microk_haswell-2.c"

View File

@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemv_n_microk_haswell-4.c" #include "zgemv_n_microk_haswell-4.c"
#elif defined(SANDYBRIDGE) #elif defined(SANDYBRIDGE)
#include "zgemv_n_microk_sandy-4.c" #include "zgemv_n_microk_sandy-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_n_microk_bulldozer-4.c" #include "zgemv_n_microk_bulldozer-4.c"
#endif #endif

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_t_microk_bulldozer-4.c" #include "zgemv_t_microk_bulldozer-4.c"
#elif defined(HASWELL) #elif defined(HASWELL)
#include "zgemv_t_microk_haswell-4.c" #include "zgemv_t_microk_haswell-4.c"

View File

@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zscal_microk_haswell-2.c" #include "zscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) #elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "zscal_microk_bulldozer-2.c" #include "zscal_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) #elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zscal_microk_steamroller-2.c" #include "zscal_microk_steamroller-2.c"
#endif #endif

16
param.h
View File

@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 960 #define SGEMM_DEFAULT_P 1280
#define DGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 640
#define CGEMM_DEFAULT_P 720 #define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 720 #define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 640
#define CGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 640
#define SYMV_P 8 #define SYMV_P 8

View File

@ -4,6 +4,7 @@ include ../Makefile.system
all :: level1 level2 level3 all :: level1 level2 level3
level1 : sblat1 dblat1 cblat1 zblat1 level1 : sblat1 dblat1 cblat1 zblat1
ifndef CROSS
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
@ -21,8 +22,10 @@ else
OPENBLAS_NUM_THREADS=2 ./zblat1 OPENBLAS_NUM_THREADS=2 ./zblat1
endif endif
endif endif
endif
level2 : sblat2 dblat2 cblat2 zblat2 level2 : sblat2 dblat2 cblat2 zblat2
ifndef CROSS
rm -f ?BLAT2.SUMM rm -f ?BLAT2.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
@ -54,8 +57,10 @@ else
@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
endif endif
endif endif
endif
level3 : sblat3 dblat3 cblat3 zblat3 level3 : sblat3 dblat3 cblat3 zblat3
ifndef CROSS
rm -f ?BLAT3.SUMM rm -f ?BLAT3.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
@ -87,9 +92,11 @@ else
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
endif endif
endif endif
endif
level3_3m : zblat3_3m cblat3_3m level3_3m : zblat3_3m cblat3_3m
ifndef CROSS
rm -f ?BLAT3_3M.SUMM rm -f ?BLAT3_3M.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@ -109,6 +116,7 @@ else
@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
endif endif
endif endif
endif

View File

@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) $(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
run_test: $(UTESTBIN) run_test: $(UTESTBIN)
ifndef CROSS
./$(UTESTBIN) ./$(UTESTBIN)
endif
clean: clean:
-rm -f *.o $(UTESTBIN) -rm -f *.o $(UTESTBIN)