diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 4431103bd..999413be2 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -151,5 +151,9 @@ In chronological order:
* [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking
+* Paul Mustière
+ * [2016-02-04] Fix Android build on ARMV7
+ * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
+
* Shivraj Patil
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
diff --git a/Makefile b/Makefile
index 9ba2bffb3..2ae004798 100644
--- a/Makefile
+++ b/Makefile
@@ -108,8 +108,6 @@ endif
tests :
ifndef NOFORTRAN
-ifndef TARGET
-ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@@ -119,8 +117,6 @@ ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
endif
-endif
-endif
libs :
ifeq ($(CORE), UNKOWN)
diff --git a/Makefile.install b/Makefile.install
index 5da4e68c9..1b9388a8b 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -20,75 +20,75 @@ lib.grd :
$(error OpenBLAS: Please run "make" firstly)
install : lib.grd
- @-mkdir -p $(DESTDIR)$(PREFIX)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
+ @-mkdir -p "$(DESTDIR)$(PREFIX)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
- @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+ @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
+ @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
+ @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
- @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
- @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
- @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
- @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
- @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
- @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+ @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@@ -96,34 +96,34 @@ endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
- @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
- @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+ @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), Darwin)
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
else
#only static
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
- @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
+ @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK!
diff --git a/README.md b/README.md
index 32a861081..8ac88840a 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt
- **MingWin or Visual Studio(CMake)/Windows**: Please read .
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
+- **Android**: Supported by community. Please read .
## Usages
Link with libopenblas.a or -lopenblas for shared library.
diff --git a/c_check b/c_check
index d624472dc..50ff360a2 100644
--- a/c_check
+++ b/c_check
@@ -1,5 +1,7 @@
#!/usr/bin/perl
+use File::Basename;
+
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
@@ -26,14 +28,12 @@ if ($?) {
$cross_suffix = "";
-if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
- if ($1 =~ /(.*-)(.*)/) {
- $cross_suffix = $1;
- }
-} else {
- if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
- $cross_suffix = $1;
- }
+if (dirname($compiler_name) ne ".") {
+ $cross_suffix .= dirname($compiler_name) . "/";
+}
+
+if (basename($compiler_name) =~ /(.*-)(.*)/) {
+ $cross_suffix .= $1;
}
$compiler = "";
@@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
-print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
+print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
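
Note: the rewritten detection above derives the cross-toolchain prefix from the compiler path itself: the directory part (if any) is kept, and the basename is truncated after its last '-'. A minimal C sketch of the same rule (the NDK path below is a hypothetical example, not taken from the patch):

    /* Sketch of the CROSS_SUFFIX derivation done above in Perl:
     * keep the directory, plus the basename up to and including
     * its last '-'. */
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *cc = "/opt/ndk/bin/arm-linux-androideabi-gcc";
        const char *base = strrchr(cc, '/');    /* last path separator */
        base = base ? base + 1 : cc;
        const char *dash = strrchr(base, '-');  /* last '-' in basename */
        if (dash)
            printf("CROSS_SUFFIX=%.*s\n", (int)(dash - cc + 1), cc);
        /* prints CROSS_SUFFIX=/opt/ndk/bin/arm-linux-androideabi- */
        return 0;
    }
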
diff --git a/ctest/Makefile b/ctest/Makefile
index 7a5d236aa..6eda43863 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
+ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat1
OMP_NUM_THREADS=2 ./xdcblat1
@@ -53,8 +54,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat1
OPENBLAS_NUM_THREADS=2 ./xzcblat1
endif
+endif
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
+ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat2 < sin2
OMP_NUM_THREADS=2 ./xdcblat2 < din2
@@ -66,8 +69,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
endif
+endif
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
+ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat3 < sin3
OMP_NUM_THREADS=2 ./xdcblat3 < din3
@@ -88,6 +93,7 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
+endif
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 2fde07fcc..9e8cce438 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
- for ( i=1 ; i <= 21; i++)
+ for ( i=1 ; i <= 22; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
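
Note: the loop bound here tracks the size of the corename table by hand (21 becomes 22 after a new entry, presumably the EXCAVATOR core added in parameter.c below), so it has to be remembered on every addition. A sketch of the usual way to keep the bound self-maintaining; the table entries are placeholders, not the real list:

    /* Illustrative only: derive the loop bound from the table size
     * instead of hard-coding it. */
    #include <stdio.h>
    #include <strings.h>

    static const char *corename[] = {
        "UNKNOWN", "KATMAI", "NORTHWOOD", "EXCAVATOR", /* ... */
    };
    #define NUM_CORENAMES (sizeof(corename) / sizeof(corename[0]))

    static int find_coretype(const char *coretype) {
        size_t i;
        for (i = 1; i < NUM_CORENAMES; i++)   /* index 0 is UNKNOWN */
            if (!strncasecmp(coretype, corename[i], 20))
                return (int)i;
        return 0;
    }

    int main(void) {
        printf("%d\n", find_coretype("excavator"));  /* prints 3 */
        return 0;
    }
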
diff --git a/driver/others/init.c b/driver/others/init.c
index f134f85f7..801f93991 100644
--- a/driver/others/init.c
+++ b/driver/others/init.c
@@ -361,6 +361,9 @@ static void numa_mapping(void) {
unsigned long work, bit;
int count = 0;
int bitmask_idx = 0;
+ int current_cpu;
+ int current_node = 0;
+ int cpu_count = 0;
for (node = 0; node < common -> num_nodes; node ++) {
core = 0;
@@ -382,33 +385,84 @@ static void numa_mapping(void) {
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif
- h = 1;
+ current_cpu = sched_getcpu();
+ for (cpu = 0; cpu < count; cpu++) {
+ if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
+ current_node = READ_NODE(common -> cpu_info[cpu]);
+ break;
+ }
+ }
+ for (i = 0; i < MAX_BITMASK_LEN; i++)
+ cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
- while (h < count) h = 2 * h + 1;
+ /*
+	 * If all the processes can be accommodated
+	 * in the current node itself, then bind to cores
+ * from the current node only
+ */
+ if (numprocs <= cpu_count) {
+ /*
+	 * First sort the cores of the current node in order.
+	 * Then take the remaining nodes one by one in ascending
+	 * order, and sort their cores in order.
+ */
+ for (i = 0; i < count; i++) {
+ for (j = 0; j < count - 1; j++) {
+ int node_1, node_2;
+ int core_1, core_2;
+ int swap = 0;
- while (h > 1) {
- h /= 2;
- for (i = h; i < count; i++) {
- work = common -> cpu_info[i];
- bit = CPU_ISSET(i, &cpu_orig_mask[0]);
- j = i - h;
- while (work < common -> cpu_info[j]) {
- common -> cpu_info[j + h] = common -> cpu_info[j];
- if (CPU_ISSET(j, &cpu_orig_mask[0])) {
- CPU_SET(j + h, &cpu_orig_mask[0]);
- } else {
- CPU_CLR(j + h, &cpu_orig_mask[0]);
- }
- j -= h;
- if (j < 0) break;
- }
- common -> cpu_info[j + h] = work;
- if (bit) {
- CPU_SET(j + h, &cpu_orig_mask[0]);
- } else {
- CPU_CLR(j + h, &cpu_orig_mask[0]);
+ node_1 = READ_NODE(common -> cpu_info[j]);
+ node_2 = READ_NODE(common -> cpu_info[j + 1]);
+ core_1 = READ_CORE(common -> cpu_info[j]);
+ core_2 = READ_CORE(common -> cpu_info[j + 1]);
+
+ if (node_1 == node_2) {
+ if (core_1 > core_2)
+ swap = 1;
+ } else {
+ if ((node_2 == current_node) ||
+ ((node_1 != current_node) && (node_1 > node_2)))
+ swap = 1;
+ }
+ if (swap) {
+ unsigned long temp;
+
+ temp = common->cpu_info[j];
+ common->cpu_info[j] = common->cpu_info[j + 1];
+ common->cpu_info[j + 1] = temp;
+ }
}
+ }
+ } else {
+ h = 1;
+ while (h < count) h = 2 * h + 1;
+
+ while (h > 1) {
+ h /= 2;
+ for (i = h; i < count; i++) {
+ work = common -> cpu_info[i];
+ bit = CPU_ISSET(i, &cpu_orig_mask[0]);
+ j = i - h;
+ while (work < common -> cpu_info[j]) {
+ common -> cpu_info[j + h] = common -> cpu_info[j];
+ if (CPU_ISSET(j, &cpu_orig_mask[0])) {
+ CPU_SET(j + h, &cpu_orig_mask[0]);
+ } else {
+ CPU_CLR(j + h, &cpu_orig_mask[0]);
+ }
+ j -= h;
+ if (j < 0) break;
+ }
+ common -> cpu_info[j + h] = work;
+ if (bit) {
+ CPU_SET(j + h, &cpu_orig_mask[0]);
+ } else {
+ CPU_CLR(j + h, &cpu_orig_mask[0]);
+ }
+
+ }
}
}
@@ -416,7 +470,10 @@ static void numa_mapping(void) {
fprintf(stderr, "\nSorting ...\n\n");
for (cpu = 0; cpu < count; cpu++)
- fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
+ fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
+ READ_CPU(common -> cpu_info[cpu]),
+ READ_CORE(common -> cpu_info[cpu]),
+ READ_NODE(common -> cpu_info[cpu]));
#endif
}
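
Note: the rewritten numa_mapping() above orders CPUs so that the node the process is already running on comes first, then the remaining nodes ascend, with cores ascending inside each node; the bubble sort encodes exactly that in its swap condition. A standalone sketch of the same ordering on toy (node, core) pairs (the real code packs these fields into common->cpu_info words read via READ_NODE/READ_CORE):

    /* Ordering used above: current node first, then remaining nodes
     * ascending, cores ascending within a node. */
    #include <stdio.h>
    #include <stdlib.h>

    struct cpu { int node, core; };

    static int current_node = 1;  /* node of sched_getcpu() in the real code */

    static int cmp(const void *a, const void *b) {
        const struct cpu *x = a, *y = b;
        if (x->node != y->node) {
            if (x->node == current_node) return -1;  /* current node wins */
            if (y->node == current_node) return  1;
            return x->node - y->node;                /* then ascending node */
        }
        return x->core - y->core;                    /* ascending core */
    }

    int main(void) {
        struct cpu cpus[] = { {0,1}, {1,0}, {0,0}, {1,1} };
        qsort(cpus, 4, sizeof cpus[0], cmp);
        for (int i = 0; i < 4; i++)
            printf("node=%d core=%d\n", cpus[i].node, cpus[i].core);
        return 0;  /* node 1 (current) prints first, then node 0 */
    }
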
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index f4b1a80ad..f22c6b69a 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
- defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
+ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
int size = 16;
#else
int size = get_L2_size();
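
Note: EXCAVATOR joins the cores whose L2 size is read from extended CPUID leaf 0x80000006 (ECX bits 31:16 report the L2 size in KB) and, in blas_set_parameter(), the cores that skip the query in favor of a fixed value. A minimal sketch of that CPUID read, assuming GCC/Clang on x86 with <cpuid.h>:

    /* Leaf 0x80000006 reports L2 size in KB in ECX[31:16];
     * __get_cpuid returns 0 if the leaf is unsupported. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx)) {
            fprintf(stderr, "leaf 0x80000006 not supported\n");
            return 1;
        }
        printf("L2 size: %u KB\n", ecx >> 16);
        return 0;
    }
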
diff --git a/exports/Makefile b/exports/Makefile
index c2b8d9c1c..5632b6fff 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
- $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
- $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index e1b89cc97..8e3d084aa 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+SGEMMOTCOPY = sgemm_tcopy_8_power8.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
@@ -21,16 +21,16 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
-DGEMMONCOPY = gemm_ncopy_4.S
-DGEMMOTCOPY = gemm_tcopy_4.S
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMONCOPY = dgemm_ncopy_4_power8.S
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
@@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+ZGEMMITCOPY = zgemm_tcopy_8_power8.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o
diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..b1a7d2b27
--- /dev/null
+++ b/kernel/power/cgemm_tcopy_8_power8.S
@@ -0,0 +1,206 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define o4 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTUS2 r27
+#define M8 r30
+#define T1 r31
+
+#define o0 0
+
+#include "cgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, ZBASE_SHIFT
+ slwi M8, M, 3 + ZBASE_SHIFT
+
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, ZBASE_SHIFT
+ slwi B2, B2, ZBASE_SHIFT
+ slwi B1, B1, ZBASE_SHIFT
+
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M8, 128
+
+ li o4, 4
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "cgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/cgemm_tcopy_logic_8_power8.S b/kernel/power/cgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..9418908b7
--- /dev/null
+++ b/kernel/power/cgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. I, M, 2
+ ble CCOPYT_L2_BEGIN
+
+
+CCOPYT_L4_BEGIN:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+ mr B8, B
+ addi B, B, 64*SIZE
+
+ sradi. J, N, 3
+ ble CCOPYT_L4x4_BEGIN
+
+ mr BO, B8
+
+CCOPYT_L4x8_LOOP:
+
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ dcbtst BO, M8
+ dcbtst BO, PREB
+ COPY_4x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ ble CCOPYT_L4x4_BEGIN
+
+
+ COPY_4x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt CCOPYT_L4x8_LOOP
+
+CCOPYT_L4x4_BEGIN:
+
+ andi. T1, N, 4
+ ble CCOPYT_L4x2_BEGIN
+
+ mr BO, B4
+
+ COPY_4x4
+
+
+ addi B4, B4, 32*SIZE
+
+CCOPYT_L4x2_BEGIN:
+
+ andi. T1, N, 2
+ ble CCOPYT_L4x1_BEGIN
+
+ mr BO, B2
+
+ COPY_4x2
+
+
+ addi B2, B2, 16*SIZE
+
+CCOPYT_L4x1_BEGIN:
+
+ andi. T1, N, 1
+ ble CCOPYT_L4_END
+
+ mr BO, B1
+
+ COPY_4x1
+
+
+ addi B1, B1, 8*SIZE
+
+CCOPYT_L4_END:
+
+ addic. I, I, -1
+ bgt CCOPYT_L4_BEGIN
+
+
+
+CCOPYT_L2_BEGIN:
+
+ andi. T1, M, 2
+ ble CCOPYT_L1_BEGIN
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+ mr B8, B
+ addi B, B, 32*SIZE
+
+ sradi. J, N, 3
+ ble CCOPYT_L2x4_BEGIN
+
+ mr BO, B8
+
+CCOPYT_L2x8_LOOP:
+
+ COPY_2x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt CCOPYT_L2x8_LOOP
+
+CCOPYT_L2x4_BEGIN:
+
+ andi. T1, N, 4
+ ble CCOPYT_L2x2_BEGIN
+
+ mr BO, B4
+
+ COPY_2x4
+
+
+ addi B4, B4, 16*SIZE
+
+CCOPYT_L2x2_BEGIN:
+
+ andi. T1, N, 2
+ ble CCOPYT_L2x1_BEGIN
+
+ mr BO, B2
+
+ COPY_2x2
+
+
+ addi B2, B2, 8*SIZE
+
+CCOPYT_L2x1_BEGIN:
+
+ andi. T1, N, 1
+ ble CCOPYT_L2_END
+
+ mr BO, B1
+
+ COPY_2x1
+
+
+ addi B1, B1, 4*SIZE
+
+CCOPYT_L2_END:
+
+
+CCOPYT_L1_BEGIN:
+
+ andi. T1, M, 1
+ ble L999
+
+ mr A0, A
+ add A, A0, LDA
+ mr B8, B
+ addi B, B, 16*SIZE
+
+ sradi. J, N, 3
+ ble CCOPYT_L1x4_BEGIN
+
+ mr BO, B8
+
+CCOPYT_L1x8_LOOP:
+
+ COPY_1x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt CCOPYT_L1x8_LOOP
+
+CCOPYT_L1x4_BEGIN:
+
+ andi. T1, N, 4
+ ble CCOPYT_L1x2_BEGIN
+
+ mr BO, B4
+
+ COPY_1x4
+
+
+ addi B4, B4, 8*SIZE
+
+CCOPYT_L1x2_BEGIN:
+
+ andi. T1, N, 2
+ ble CCOPYT_L1x1_BEGIN
+
+ mr BO, B2
+
+ COPY_1x2
+
+
+ addi B2, B2, 4*SIZE
+
+CCOPYT_L1x1_BEGIN:
+
+ andi. T1, N, 1
+ ble CCOPYT_L1_END
+
+ mr BO, B1
+
+ COPY_1x1
+
+
+ addi B1, B1, 2*SIZE
+
+CCOPYT_L1_END:
+
diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..03fda2766
--- /dev/null
+++ b/kernel/power/cgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,385 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ lxvw4x vs34, o32, A0
+ lxvw4x vs35, o48, A0
+
+ lxvw4x vs36, o0, A1
+ lxvw4x vs37, o16, A1
+ lxvw4x vs38, o32, A1
+ lxvw4x vs39, o48, A1
+
+ addi A0, A0, 64
+ addi A1, A1, 64
+
+ lxvw4x vs40, o0, A2
+ lxvw4x vs41, o16, A2
+ lxvw4x vs42, o32, A2
+ lxvw4x vs43, o48, A2
+
+ lxvw4x vs44, o0, A3
+ lxvw4x vs45, o16, A3
+ lxvw4x vs46, o32, A3
+ lxvw4x vs47, o48, A3
+
+ mr T1, BO
+ addi A2, A2, 64
+ addi A3, A3, 64
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs40, o0, T1
+ stxvw4x vs41, o16, T1
+ stxvw4x vs42, o32, T1
+ stxvw4x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs44, o0, T1
+ stxvw4x vs45, o16, T1
+ stxvw4x vs46, o32, T1
+ stxvw4x vs47, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ addi A0, A0, 32
+
+ lxvw4x vs34, o0, A1
+ lxvw4x vs35, o16, A1
+ addi A1, A1, 32
+
+ lxvw4x vs36, o0, A2
+ lxvw4x vs37, o16, A2
+ addi A2, A2, 32
+
+ lxvw4x vs38, o0, A3
+ lxvw4x vs39, o16, A3
+ addi A3, A3, 32
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxvw4x vs32, o0, A0
+ addi A0, A0, 16
+
+ lxvw4x vs33, o0, A1
+ addi A1, A1, 16
+
+ lxvw4x vs34, o0, A2
+ addi A2, A2, 16
+
+ lxvw4x vs35, o0, A3
+ addi A3, A3, 16
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+ addi A0, A0, 8
+
+ lxsspx vs34, o0, A1
+ lxsspx vs35, o4, A1
+ addi A1, A1, 8
+
+ lxsspx vs36, o0, A2
+ lxsspx vs37, o4, A2
+ addi A2, A2, 8
+
+ lxsspx vs38, o0, A3
+ lxsspx vs39, o4, A3
+ addi A3, A3, 8
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+ stxsspx vs35, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs36, o0, T1
+ stxsspx vs37, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs38, o0, T1
+ stxsspx vs39, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ lxvw4x vs34, o32, A0
+ lxvw4x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvw4x vs36, o0, A1
+ lxvw4x vs37, o16, A1
+ lxvw4x vs38, o32, A1
+ lxvw4x vs39, o48, A1
+ addi A1, A1, 64
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ addi A0, A0, 32
+
+ lxvw4x vs34, o0, A1
+ lxvw4x vs35, o16, A1
+ addi A1, A1, 32
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxvw4x vs32, o0, A0
+ addi A0, A0, 16
+
+ lxvw4x vs33, o0, A1
+ addi A1, A1, 16
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+ stxvw4x vs33, o16, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+ addi A0, A0, 8
+
+ lxsspx vs34, o0, A1
+ lxsspx vs35, o4, A1
+ addi A1, A1, 8
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+ stxsspx vs35, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ lxvw4x vs34, o32, A0
+ lxvw4x vs35, o48, A0
+ addi A0, A0, 64
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ addi A0, A0, 32
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxvw4x vs32, o0, A0
+ addi A0, A0, 16
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+ addi A0, A0, 8
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+.endm
+
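
Note: each COPY_NxM macro above moves an N-row by M-column tile of single-precision complex elements from A (rows LDA apart) into one contiguous panel of B, using 16-byte lxvw4x/stxvw4x vector transfers where the tile width allows. For the full 4x8 tile, a rough scalar C equivalent (stride lda given in elements; the assembly works in bytes after the ZBASE_SHIFT scaling):

    /* Rough C equivalent of COPY_4x8: copy a 4x8 tile of complex
     * floats into a contiguous 32-element block of b, rows
     * back-to-back. Sketch only. */
    #include <complex.h>

    static void copy_4x8(const float complex *a, int lda, float complex *b) {
        for (int i = 0; i < 4; i++)           /* four rows A0..A3 */
            for (int j = 0; j < 8; j++)       /* eight columns per row */
                *b++ = a[i * (long)lda + j];
    }
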
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
index 4c14b0c6f..8af7fe389 100644
--- a/kernel/power/dgemm_kernel_16x4_power8.S
+++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0
+#define T4 r12
+#define T3 r11
+
+#define o40 r12
+#define o56 r11
+
+#define o112 r14
#define o8 r15
#define o24 r16
-#define ALPHA r17
+#define o64 r17
#define L r18
#define T1 r19
-#define KK r20
-#define BB r21
+#define o80 r20
+#define o96 r21
#define I r22
#define J r23
#define AO r24
@@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
+ std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
+ stw r14, 212(SP)
#endif
stfd f1, ALPHA_SP
@@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble .L999_H1
#ifdef __64BIT__
- addi ALPHA, SP, 296
+ addi T1, SP, 296
#else
- addi ALPHA, SP, 224
+ addi T1, SP, 224
#endif
- li PRE, 256
+ li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
+ li o64, 64
+ li o80, 80
+ li o96, 96
+ li o112, 112
- lxvdsx alpha_r, 0, ALPHA
+ lxvdsx alpha_r, 0, T1
#include "dgemm_logic_16x4_power8.S"
@@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
+ ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
+ lwz r14, 212(SP)
#endif
addi SP, SP, STACKSIZE
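
Note: besides the extra save of r14 and the new offset registers, the prefetch distance PRE grows from 256 to 384 bytes, and the logic file below touches the four C output rows through 128-byte-aligned addresses before entering the inner loop ("li L, -128" then "and"/"dcbt"). Masking an address down to the cache-line boundary before prefetching is the same trick as this C sketch (assuming 128-byte lines, as on POWER8):

    /* Align the address down to a 128-byte line before touching it,
     * mirroring "li L, -128 ; and T1, T1, L ; dcbt T1, r0". */
    #include <stdint.h>

    static inline void prefetch_line(const void *p) {
        uintptr_t a = (uintptr_t)p & ~(uintptr_t)127;  /* -128 == ~127 */
        __builtin_prefetch((const void *)a, 0, 3);     /* read, keep cached */
    }
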
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
index 49c438f61..718f80bdd 100644
--- a/kernel/power/dgemm_logic_16x4_power8.S
+++ b/kernel/power/dgemm_logic_16x4_power8.S
@@ -35,193 +35,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srawi. J, N, 2
- ble .LDGEMM_L4_END
+ ble LDGEMM_L4_END
-.LDGEMM_L4_BEGIN:
+LDGEMM_L4_BEGIN:
mr CO, C
mr AO, A
slwi T1, LDC , 2
add C, C, T1
srawi. I, M, 4
- ble .LDGEMM_L4x16_END
+ ble LDGEMM_L4x16_END
-.LDGEMM_L4x16_BEGIN:
+ .align 4
+LDGEMM_L4x16_BEGIN:
+ li L, -128
+
+ mr T1, CO
+ add T2, T1, LDC
+ add T3, T2, LDC
+ add T4, T3, LDC
+
+ and T1, T1, L
+ and T2, T2, L
+ and T3, T3, L
+ and T4, T4, L
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
mr BO, B
- srawi. L, K, 3
- ble .LDGEMM_L4x16_SUB0
+ srawi. L, K, 1
+
+ addi T1, T1, 128
+ addi T2, T2, 128
+ addi T3, T3, 128
+ addi T4, T4, 128
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
+ ble LDGEMM_L4x16_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x16_SUB4
+ ble LDGEMM_L4x16_SUB4
-.LDGEMM_L4x16_LOOP_START:
+ .align 4
+LDGEMM_L4x16_LOOP_START:
- dcbt AO, PRE
+ li o40, 40
+ li o56, 56
+
+ dcbt AO, PRE
LOAD4x16_1
- dcbt AO, PRE
+ dcbt AO, PRE
KERNEL4x16_I1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
-
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
-
+ dcbt AO, PRE
addic. L, L, -2
- ble .LDGEMM_L4x16_LOOP_END
+ KERNEL4x16_L2
- .align 5
+ ble LDGEMM_L4x16_LOOP_END
-.LDGEMM_L4x16_LOOP:
+ .align 4
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+LDGEMM_L4x16_LOOP:
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
addic. L, L, -1
- bgt .LDGEMM_L4x16_LOOP
+ KERNEL4x16_L2
-.LDGEMM_L4x16_LOOP_END:
+ bgt LDGEMM_L4x16_LOOP
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+ .align 4
+
+LDGEMM_L4x16_LOOP_END:
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
KERNEL4x16_1
KERNEL4x16_E2
- b .LDGEMM_L4x16_SUB1
+ b LDGEMM_L4x16_SUB1
-.LDGEMM_L4x16_SUB4:
+LDGEMM_L4x16_SUB4:
- dcbt AO, PRE
KERNEL4x16_SUBI1
- dcbt AO, PRE
- KERNEL4x16_SUB1
- dcbt AO, PRE
- KERNEL4x16_SUB1
- dcbt AO, PRE
KERNEL4x16_SUB1
- KERNEL4x16_SUB1
- KERNEL4x16_SUB1
- KERNEL4x16_SUB1
- KERNEL4x16_SUB1
+ b LDGEMM_L4x16_SUB1
- b .LDGEMM_L4x16_SUB1
+LDGEMM_L4x16_SUB0:
-.LDGEMM_L4x16_SUB0:
-
- andi. L, K, 7
+ andi. L, K, 1
KERNEL4x16_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x16_SAVE
- b .LDGEMM_L4x16_SUB2
+ ble LDGEMM_L4x16_SAVE
+ b LDGEMM_L4x16_SUB2
-.LDGEMM_L4x16_SUB1:
+LDGEMM_L4x16_SUB1:
- andi. L, K, 7
- ble .LDGEMM_L4x16_SAVE
+ andi. L, K, 1
+ ble LDGEMM_L4x16_SAVE
-.LDGEMM_L4x16_SUB2:
+LDGEMM_L4x16_SUB2:
KERNEL4x16_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x16_SUB2
+ bgt LDGEMM_L4x16_SUB2
-.LDGEMM_L4x16_SAVE:
+ .align 4
+LDGEMM_L4x16_SAVE:
SAVE4x16
addic. I, I, -1
- bgt .LDGEMM_L4x16_BEGIN
+ bgt LDGEMM_L4x16_BEGIN
-.LDGEMM_L4x16_END:
+LDGEMM_L4x16_END:
-.LDGEMM_L4x8_BEGIN:
+LDGEMM_L4x8_BEGIN:
andi. T2, M, 15
- ble .LDGEMM_L4x1_END
+ ble LDGEMM_L4x1_END
andi. T1, M, 8
- ble .LDGEMM_L4x8_END
+ ble LDGEMM_L4x8_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x8_SUB0
+ ble LDGEMM_L4x8_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x8_SUB4
+ ble LDGEMM_L4x8_SUB4
-.LDGEMM_L4x8_LOOP_START:
+LDGEMM_L4x8_LOOP_START:
+ dcbt AO, PRE
LOAD4x8_1
KERNEL4x8_I1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
addic. L, L, -2
- ble .LDGEMM_L4x8_LOOP_END
+ ble LDGEMM_L4x8_LOOP_END
.align 5
-.LDGEMM_L4x8_LOOP:
+LDGEMM_L4x8_LOOP:
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
addic. L, L, -1
- bgt .LDGEMM_L4x8_LOOP
+ bgt LDGEMM_L4x8_LOOP
-.LDGEMM_L4x8_LOOP_END:
+LDGEMM_L4x8_LOOP_END:
KERNEL4x8_1
KERNEL4x8_2
@@ -233,9 +227,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x8_1
KERNEL4x8_E2
- b .LDGEMM_L4x8_SUB1
+ b LDGEMM_L4x8_SUB1
-.LDGEMM_L4x8_SUB4:
+LDGEMM_L4x8_SUB4:
KERNEL4x8_SUBI1
KERNEL4x8_SUB1
@@ -247,81 +241,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x8_SUB1
KERNEL4x8_SUB1
- b .LDGEMM_L4x8_SUB1
+ b LDGEMM_L4x8_SUB1
-.LDGEMM_L4x8_SUB0:
+LDGEMM_L4x8_SUB0:
andi. L, K, 7
KERNEL4x8_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x8_SAVE
- b .LDGEMM_L4x8_SUB2
+ ble LDGEMM_L4x8_SAVE
+ b LDGEMM_L4x8_SUB2
-.LDGEMM_L4x8_SUB1:
+LDGEMM_L4x8_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x8_SAVE
+ ble LDGEMM_L4x8_SAVE
-.LDGEMM_L4x8_SUB2:
+LDGEMM_L4x8_SUB2:
KERNEL4x8_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x8_SUB2
+ bgt LDGEMM_L4x8_SUB2
-.LDGEMM_L4x8_SAVE:
+LDGEMM_L4x8_SAVE:
SAVE4x8
-.LDGEMM_L4x8_END:
+LDGEMM_L4x8_END:
-.LDGEMM_L4x4_BEGIN:
+LDGEMM_L4x4_BEGIN:
andi. T1, M, 4
- ble .LDGEMM_L4x4_END
+ ble LDGEMM_L4x4_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x4_SUB0
+ ble LDGEMM_L4x4_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x4_SUB4
+ ble LDGEMM_L4x4_SUB4
-.LDGEMM_L4x4_LOOP_START:
+LDGEMM_L4x4_LOOP_START:
+ dcbt AO, PRE
LOAD4x4_1
KERNEL4x4_I1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
addic. L, L, -2
- ble .LDGEMM_L4x4_LOOP_END
+ ble LDGEMM_L4x4_LOOP_END
.align 5
-.LDGEMM_L4x4_LOOP:
+LDGEMM_L4x4_LOOP:
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
addic. L, L, -1
- bgt .LDGEMM_L4x4_LOOP
+ bgt LDGEMM_L4x4_LOOP
-.LDGEMM_L4x4_LOOP_END:
+LDGEMM_L4x4_LOOP_END:
KERNEL4x4_1
KERNEL4x4_2
@@ -333,9 +332,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x4_1
KERNEL4x4_E2
- b .LDGEMM_L4x4_SUB1
+ b LDGEMM_L4x4_SUB1
-.LDGEMM_L4x4_SUB4:
+LDGEMM_L4x4_SUB4:
KERNEL4x4_SUBI1
KERNEL4x4_SUB1
@@ -347,48 +346,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x4_SUB1
KERNEL4x4_SUB1
- b .LDGEMM_L4x4_SUB1
+ b LDGEMM_L4x4_SUB1
-.LDGEMM_L4x4_SUB0:
+LDGEMM_L4x4_SUB0:
andi. L, K, 7
KERNEL4x4_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x4_SAVE
- b .LDGEMM_L4x4_SUB2
+ ble LDGEMM_L4x4_SAVE
+ b LDGEMM_L4x4_SUB2
-.LDGEMM_L4x4_SUB1:
+LDGEMM_L4x4_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x4_SAVE
+ ble LDGEMM_L4x4_SAVE
-.LDGEMM_L4x4_SUB2:
+LDGEMM_L4x4_SUB2:
KERNEL4x4_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x4_SUB2
+ bgt LDGEMM_L4x4_SUB2
-.LDGEMM_L4x4_SAVE:
+LDGEMM_L4x4_SAVE:
SAVE4x4
-.LDGEMM_L4x4_END:
+LDGEMM_L4x4_END:
-.LDGEMM_L4x2_BEGIN:
+LDGEMM_L4x2_BEGIN:
andi. T1, M, 2
- ble .LDGEMM_L4x2_END
+ ble LDGEMM_L4x2_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x2_SUB0
+ ble LDGEMM_L4x2_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x2_SUB4
+ ble LDGEMM_L4x2_SUB4
-.LDGEMM_L4x2_LOOP_START:
+LDGEMM_L4x2_LOOP_START:
LOAD4x2_1
KERNEL4x2_I1
@@ -402,11 +401,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_2
addic. L, L, -2
- ble .LDGEMM_L4x2_LOOP_END
+ ble LDGEMM_L4x2_LOOP_END
.align 5
-.LDGEMM_L4x2_LOOP:
+LDGEMM_L4x2_LOOP:
KERNEL4x2_1
KERNEL4x2_2
@@ -419,9 +418,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_2
addic. L, L, -1
- bgt .LDGEMM_L4x2_LOOP
+ bgt LDGEMM_L4x2_LOOP
-.LDGEMM_L4x2_LOOP_END:
+LDGEMM_L4x2_LOOP_END:
KERNEL4x2_1
KERNEL4x2_2
@@ -433,9 +432,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_1
KERNEL4x2_E2
- b .LDGEMM_L4x2_SUB1
+ b LDGEMM_L4x2_SUB1
-.LDGEMM_L4x2_SUB4:
+LDGEMM_L4x2_SUB4:
KERNEL4x2_SUBI1
KERNEL4x2_SUB1
@@ -447,48 +446,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_SUB1
KERNEL4x2_SUB1
- b .LDGEMM_L4x2_SUB1
+ b LDGEMM_L4x2_SUB1
-.LDGEMM_L4x2_SUB0:
+LDGEMM_L4x2_SUB0:
andi. L, K, 7
KERNEL4x2_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x2_SAVE
- b .LDGEMM_L4x2_SUB2
+ ble LDGEMM_L4x2_SAVE
+ b LDGEMM_L4x2_SUB2
-.LDGEMM_L4x2_SUB1:
+LDGEMM_L4x2_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x2_SAVE
+ ble LDGEMM_L4x2_SAVE
-.LDGEMM_L4x2_SUB2:
+LDGEMM_L4x2_SUB2:
KERNEL4x2_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x2_SUB2
+ bgt LDGEMM_L4x2_SUB2
-.LDGEMM_L4x2_SAVE:
+LDGEMM_L4x2_SAVE:
SAVE4x2
-.LDGEMM_L4x2_END:
+LDGEMM_L4x2_END:
-.LDGEMM_L4x1_BEGIN:
+LDGEMM_L4x1_BEGIN:
andi. T1, M, 1
- ble .LDGEMM_L4x1_END
+ ble LDGEMM_L4x1_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x1_SUB0
+ ble LDGEMM_L4x1_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x1_SUB4
+ ble LDGEMM_L4x1_SUB4
-.LDGEMM_L4x1_LOOP_START:
+LDGEMM_L4x1_LOOP_START:
LOAD4x1_1
KERNEL4x1_I1
@@ -502,11 +501,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_2
addic. L, L, -2
- ble .LDGEMM_L4x1_LOOP_END
+ ble LDGEMM_L4x1_LOOP_END
.align 5
-.LDGEMM_L4x1_LOOP:
+LDGEMM_L4x1_LOOP:
KERNEL4x1_1
KERNEL4x1_2
@@ -519,9 +518,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_2
addic. L, L, -1
- bgt .LDGEMM_L4x1_LOOP
+ bgt LDGEMM_L4x1_LOOP
-.LDGEMM_L4x1_LOOP_END:
+LDGEMM_L4x1_LOOP_END:
KERNEL4x1_1
KERNEL4x1_2
@@ -533,9 +532,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_1
KERNEL4x1_E2
- b .LDGEMM_L4x1_SUB1
+ b LDGEMM_L4x1_SUB1
-.LDGEMM_L4x1_SUB4:
+LDGEMM_L4x1_SUB4:
KERNEL4x1_SUBI1
KERNEL4x1_SUB1
@@ -547,74 +546,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_SUB1
KERNEL4x1_SUB1
- b .LDGEMM_L4x1_SUB1
+ b LDGEMM_L4x1_SUB1
-.LDGEMM_L4x1_SUB0:
+LDGEMM_L4x1_SUB0:
andi. L, K, 7
KERNEL4x1_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x1_SAVE
- b .LDGEMM_L4x1_SUB2
+ ble LDGEMM_L4x1_SAVE
+ b LDGEMM_L4x1_SUB2
-.LDGEMM_L4x1_SUB1:
+LDGEMM_L4x1_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x1_SAVE
+ ble LDGEMM_L4x1_SAVE
-.LDGEMM_L4x1_SUB2:
+LDGEMM_L4x1_SUB2:
KERNEL4x1_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x1_SUB2
+ bgt LDGEMM_L4x1_SUB2
-.LDGEMM_L4x1_SAVE:
+LDGEMM_L4x1_SAVE:
SAVE4x1
-.LDGEMM_L4x1_END:
+LDGEMM_L4x1_END:
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
- bgt .LDGEMM_L4_BEGIN
+ bgt LDGEMM_L4_BEGIN
andi. T2, N, 3
ble .L999
-.LDGEMM_L4_END:
+LDGEMM_L4_END:
- b .LDGEMM_L2_BEGIN
+ b LDGEMM_L2_BEGIN
.L999_H1:
b .L999
-.LDGEMM_L2_BEGIN:
+LDGEMM_L2_BEGIN:
andi. T1, N, 2
- ble .LDGEMM_L2_END
+ ble LDGEMM_L2_END
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 4
- ble .LDGEMM_L2x16_END
+ ble LDGEMM_L2x16_END
-.LDGEMM_L2x16_BEGIN:
+LDGEMM_L2x16_BEGIN:
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x16_SUB0
+ ble LDGEMM_L2x16_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x16_SUB4
+ ble LDGEMM_L2x16_SUB4
-.LDGEMM_L2x16_LOOP_START:
+LDGEMM_L2x16_LOOP_START:
dcbt AO, PRE
LOAD2x16_1
@@ -637,11 +636,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_2
addic. L, L, -2
- ble .LDGEMM_L2x16_LOOP_END
+ ble LDGEMM_L2x16_LOOP_END
.align 5
-.LDGEMM_L2x16_LOOP:
+LDGEMM_L2x16_LOOP:
dcbt AO, PRE
KERNEL2x16_1
@@ -662,9 +661,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_2
addic. L, L, -1
- bgt .LDGEMM_L2x16_LOOP
+ bgt LDGEMM_L2x16_LOOP
-.LDGEMM_L2x16_LOOP_END:
+LDGEMM_L2x16_LOOP_END:
dcbt AO, PRE
KERNEL2x16_1
@@ -683,9 +682,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_1
KERNEL2x16_E2
- b .LDGEMM_L2x16_SUB1
+ b LDGEMM_L2x16_SUB1
-.LDGEMM_L2x16_SUB4:
+LDGEMM_L2x16_SUB4:
dcbt AO, PRE
KERNEL2x16_SUBI1
@@ -701,86 +700,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_SUB1
KERNEL2x16_SUB1
- b .LDGEMM_L2x16_SUB1
+ b LDGEMM_L2x16_SUB1
-.LDGEMM_L2x16_SUB0:
+LDGEMM_L2x16_SUB0:
andi. L, K, 7
KERNEL2x16_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x16_SAVE
- b .LDGEMM_L2x16_SUB2
+ ble LDGEMM_L2x16_SAVE
+ b LDGEMM_L2x16_SUB2
-.LDGEMM_L2x16_SUB1:
+LDGEMM_L2x16_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x16_SAVE
+ ble LDGEMM_L2x16_SAVE
-.LDGEMM_L2x16_SUB2:
+LDGEMM_L2x16_SUB2:
KERNEL2x16_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x16_SUB2
+ bgt LDGEMM_L2x16_SUB2
-.LDGEMM_L2x16_SAVE:
+LDGEMM_L2x16_SAVE:
SAVE2x16
addic. I, I, -1
- bgt .LDGEMM_L2x16_BEGIN
+ bgt LDGEMM_L2x16_BEGIN
-.LDGEMM_L2x16_END:
+LDGEMM_L2x16_END:
-.LDGEMM_L2x8_BEGIN:
+LDGEMM_L2x8_BEGIN:
andi. T2, M, 15
- ble .LDGEMM_L2x1_END
+ ble LDGEMM_L2x1_END
andi. T1, M, 8
- ble .LDGEMM_L2x8_END
+ ble LDGEMM_L2x8_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x8_SUB0
+ ble LDGEMM_L2x8_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x8_SUB4
+ ble LDGEMM_L2x8_SUB4
-.LDGEMM_L2x8_LOOP_START:
+LDGEMM_L2x8_LOOP_START:
+ dcbt AO, PRE
LOAD2x8_1
KERNEL2x8_I1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -2
- ble .LDGEMM_L2x8_LOOP_END
+ ble LDGEMM_L2x8_LOOP_END
.align 5
-.LDGEMM_L2x8_LOOP:
+LDGEMM_L2x8_LOOP:
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -1
- bgt .LDGEMM_L2x8_LOOP
+ bgt LDGEMM_L2x8_LOOP
-.LDGEMM_L2x8_LOOP_END:
+LDGEMM_L2x8_LOOP_END:
KERNEL2x8_1
KERNEL2x8_2
@@ -792,9 +800,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x8_1
KERNEL2x8_E2
- b .LDGEMM_L2x8_SUB1
+ b LDGEMM_L2x8_SUB1
-.LDGEMM_L2x8_SUB4:
+LDGEMM_L2x8_SUB4:
KERNEL2x8_SUBI1
KERNEL2x8_SUB1
@@ -806,48 +814,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x8_SUB1
KERNEL2x8_SUB1
- b .LDGEMM_L2x8_SUB1
+ b LDGEMM_L2x8_SUB1
-.LDGEMM_L2x8_SUB0:
+LDGEMM_L2x8_SUB0:
andi. L, K, 7
KERNEL2x8_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x8_SAVE
- b .LDGEMM_L2x8_SUB2
+ ble LDGEMM_L2x8_SAVE
+ b LDGEMM_L2x8_SUB2
-.LDGEMM_L2x8_SUB1:
+LDGEMM_L2x8_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x8_SAVE
+ ble LDGEMM_L2x8_SAVE
-.LDGEMM_L2x8_SUB2:
+LDGEMM_L2x8_SUB2:
KERNEL2x8_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x8_SUB2
+ bgt LDGEMM_L2x8_SUB2
-.LDGEMM_L2x8_SAVE:
+LDGEMM_L2x8_SAVE:
SAVE2x8
-.LDGEMM_L2x8_END:
+LDGEMM_L2x8_END:
-.LDGEMM_L2x4_BEGIN:
+LDGEMM_L2x4_BEGIN:
andi. T1, M, 4
- ble .LDGEMM_L2x4_END
+ ble LDGEMM_L2x4_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x4_SUB0
+ ble LDGEMM_L2x4_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x4_SUB4
+ ble LDGEMM_L2x4_SUB4
-.LDGEMM_L2x4_LOOP_START:
+LDGEMM_L2x4_LOOP_START:
LOAD2x4_1
KERNEL2x4_I1
@@ -861,11 +869,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_2
addic. L, L, -2
- ble .LDGEMM_L2x4_LOOP_END
+ ble LDGEMM_L2x4_LOOP_END
.align 5
-.LDGEMM_L2x4_LOOP:
+LDGEMM_L2x4_LOOP:
KERNEL2x4_1
KERNEL2x4_2
@@ -878,9 +886,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_2
addic. L, L, -1
- bgt .LDGEMM_L2x4_LOOP
+ bgt LDGEMM_L2x4_LOOP
-.LDGEMM_L2x4_LOOP_END:
+LDGEMM_L2x4_LOOP_END:
KERNEL2x4_1
KERNEL2x4_2
@@ -892,9 +900,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_1
KERNEL2x4_E2
- b .LDGEMM_L2x4_SUB1
+ b LDGEMM_L2x4_SUB1
-.LDGEMM_L2x4_SUB4:
+LDGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1
KERNEL2x4_SUB1
@@ -906,48 +914,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_SUB1
KERNEL2x4_SUB1
- b .LDGEMM_L2x4_SUB1
+ b LDGEMM_L2x4_SUB1
-.LDGEMM_L2x4_SUB0:
+LDGEMM_L2x4_SUB0:
andi. L, K, 7
KERNEL2x4_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x4_SAVE
- b .LDGEMM_L2x4_SUB2
+ ble LDGEMM_L2x4_SAVE
+ b LDGEMM_L2x4_SUB2
-.LDGEMM_L2x4_SUB1:
+LDGEMM_L2x4_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x4_SAVE
+ ble LDGEMM_L2x4_SAVE
-.LDGEMM_L2x4_SUB2:
+LDGEMM_L2x4_SUB2:
KERNEL2x4_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x4_SUB2
+ bgt LDGEMM_L2x4_SUB2
-.LDGEMM_L2x4_SAVE:
+LDGEMM_L2x4_SAVE:
SAVE2x4
-.LDGEMM_L2x4_END:
+LDGEMM_L2x4_END:
-.LDGEMM_L2x2_BEGIN:
+LDGEMM_L2x2_BEGIN:
andi. T1, M, 2
- ble .LDGEMM_L2x2_END
+ ble LDGEMM_L2x2_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x2_SUB0
+ ble LDGEMM_L2x2_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x2_SUB4
+ ble LDGEMM_L2x2_SUB4
-.LDGEMM_L2x2_LOOP_START:
+LDGEMM_L2x2_LOOP_START:
LOAD2x2_1
KERNEL2x2_I1
@@ -961,11 +969,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_2
addic. L, L, -2
- ble .LDGEMM_L2x2_LOOP_END
+ ble LDGEMM_L2x2_LOOP_END
.align 5
-.LDGEMM_L2x2_LOOP:
+LDGEMM_L2x2_LOOP:
KERNEL2x2_1
KERNEL2x2_2
@@ -978,9 +986,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_2
addic. L, L, -1
- bgt .LDGEMM_L2x2_LOOP
+ bgt LDGEMM_L2x2_LOOP
-.LDGEMM_L2x2_LOOP_END:
+LDGEMM_L2x2_LOOP_END:
KERNEL2x2_1
KERNEL2x2_2
@@ -992,9 +1000,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_1
KERNEL2x2_E2
- b .LDGEMM_L2x2_SUB1
+ b LDGEMM_L2x2_SUB1
-.LDGEMM_L2x2_SUB4:
+LDGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1
KERNEL2x2_SUB1
@@ -1006,48 +1014,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_SUB1
KERNEL2x2_SUB1
- b .LDGEMM_L2x2_SUB1
+ b LDGEMM_L2x2_SUB1
-.LDGEMM_L2x2_SUB0:
+LDGEMM_L2x2_SUB0:
andi. L, K, 7
KERNEL2x2_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x2_SAVE
- b .LDGEMM_L2x2_SUB2
+ ble LDGEMM_L2x2_SAVE
+ b LDGEMM_L2x2_SUB2
-.LDGEMM_L2x2_SUB1:
+LDGEMM_L2x2_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x2_SAVE
+ ble LDGEMM_L2x2_SAVE
-.LDGEMM_L2x2_SUB2:
+LDGEMM_L2x2_SUB2:
KERNEL2x2_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x2_SUB2
+ bgt LDGEMM_L2x2_SUB2
-.LDGEMM_L2x2_SAVE:
+LDGEMM_L2x2_SAVE:
SAVE2x2
-.LDGEMM_L2x2_END:
+LDGEMM_L2x2_END:
-.LDGEMM_L2x1_BEGIN:
+LDGEMM_L2x1_BEGIN:
andi. T1, M, 1
- ble .LDGEMM_L2x1_END
+ ble LDGEMM_L2x1_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x1_SUB0
+ ble LDGEMM_L2x1_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x1_SUB4
+ ble LDGEMM_L2x1_SUB4
-.LDGEMM_L2x1_LOOP_START:
+LDGEMM_L2x1_LOOP_START:
LOAD2x1_1
KERNEL2x1_I1
@@ -1061,11 +1069,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_2
addic. L, L, -2
- ble .LDGEMM_L2x1_LOOP_END
+ ble LDGEMM_L2x1_LOOP_END
.align 5
-.LDGEMM_L2x1_LOOP:
+LDGEMM_L2x1_LOOP:
KERNEL2x1_1
KERNEL2x1_2
@@ -1078,9 +1086,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_2
addic. L, L, -1
- bgt .LDGEMM_L2x1_LOOP
+ bgt LDGEMM_L2x1_LOOP
-.LDGEMM_L2x1_LOOP_END:
+LDGEMM_L2x1_LOOP_END:
KERNEL2x1_1
KERNEL2x1_2
@@ -1092,9 +1100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_1
KERNEL2x1_E2
- b .LDGEMM_L2x1_SUB1
+ b LDGEMM_L2x1_SUB1
-.LDGEMM_L2x1_SUB4:
+LDGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1
KERNEL2x1_SUB1
@@ -1106,59 +1114,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_SUB1
KERNEL2x1_SUB1
- b .LDGEMM_L2x1_SUB1
+ b LDGEMM_L2x1_SUB1
-.LDGEMM_L2x1_SUB0:
+LDGEMM_L2x1_SUB0:
andi. L, K, 7
KERNEL2x1_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x1_SAVE
- b .LDGEMM_L2x1_SUB2
+ ble LDGEMM_L2x1_SAVE
+ b LDGEMM_L2x1_SUB2
-.LDGEMM_L2x1_SUB1:
+LDGEMM_L2x1_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x1_SAVE
+ ble LDGEMM_L2x1_SAVE
-.LDGEMM_L2x1_SUB2:
+LDGEMM_L2x1_SUB2:
KERNEL2x1_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x1_SUB2
+ bgt LDGEMM_L2x1_SUB2
-.LDGEMM_L2x1_SAVE:
+LDGEMM_L2x1_SAVE:
SAVE2x1
-.LDGEMM_L2x1_END:
+LDGEMM_L2x1_END:
slwi T1, K, 4
add B, B, T1
-.LDGEMM_L2_END:
-.LDGEMM_L1_BEGIN:
+LDGEMM_L2_END:
+LDGEMM_L1_BEGIN:
andi. T1, N, 1
- ble .LDGEMM_L1_END
+ ble LDGEMM_L1_END
mr CO, C
mr AO, A
srawi. I, M, 4
- ble .LDGEMM_L1x16_END
+ ble LDGEMM_L1x16_END
-.LDGEMM_L1x16_BEGIN:
+LDGEMM_L1x16_BEGIN:
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x16_SUB0
+ ble LDGEMM_L1x16_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x16_SUB4
+ ble LDGEMM_L1x16_SUB4
-.LDGEMM_L1x16_LOOP_START:
+LDGEMM_L1x16_LOOP_START:
dcbt AO, PRE
LOAD1x16_1
@@ -1181,11 +1189,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_2
addic. L, L, -2
- ble .LDGEMM_L1x16_LOOP_END
+ ble LDGEMM_L1x16_LOOP_END
.align 5
-.LDGEMM_L1x16_LOOP:
+LDGEMM_L1x16_LOOP:
dcbt AO, PRE
KERNEL1x16_1
@@ -1206,9 +1214,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_2
addic. L, L, -1
- bgt .LDGEMM_L1x16_LOOP
+ bgt LDGEMM_L1x16_LOOP
-.LDGEMM_L1x16_LOOP_END:
+LDGEMM_L1x16_LOOP_END:
dcbt AO, PRE
KERNEL1x16_1
@@ -1227,9 +1235,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_1
KERNEL1x16_E2
- b .LDGEMM_L1x16_SUB1
+ b LDGEMM_L1x16_SUB1
-.LDGEMM_L1x16_SUB4:
+LDGEMM_L1x16_SUB4:
dcbt AO, PRE
KERNEL1x16_SUBI1
@@ -1245,86 +1253,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_SUB1
KERNEL1x16_SUB1
- b .LDGEMM_L1x16_SUB1
+ b LDGEMM_L1x16_SUB1
-.LDGEMM_L1x16_SUB0:
+LDGEMM_L1x16_SUB0:
andi. L, K, 7
KERNEL1x16_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x16_SAVE
- b .LDGEMM_L1x16_SUB2
+ ble LDGEMM_L1x16_SAVE
+ b LDGEMM_L1x16_SUB2
-.LDGEMM_L1x16_SUB1:
+LDGEMM_L1x16_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x16_SAVE
+ ble LDGEMM_L1x16_SAVE
-.LDGEMM_L1x16_SUB2:
+LDGEMM_L1x16_SUB2:
KERNEL1x16_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x16_SUB2
+ bgt LDGEMM_L1x16_SUB2
-.LDGEMM_L1x16_SAVE:
+LDGEMM_L1x16_SAVE:
SAVE1x16
addic. I, I, -1
- bgt .LDGEMM_L1x16_BEGIN
+ bgt LDGEMM_L1x16_BEGIN
-.LDGEMM_L1x16_END:
+LDGEMM_L1x16_END:
-.LDGEMM_L1x8_BEGIN:
+LDGEMM_L1x8_BEGIN:
andi. T2, M, 15
- ble .LDGEMM_L1x1_END
+ ble LDGEMM_L1x1_END
andi. T1, M, 8
- ble .LDGEMM_L1x8_END
+ ble LDGEMM_L1x8_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x8_SUB0
+ ble LDGEMM_L1x8_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x8_SUB4
+ ble LDGEMM_L1x8_SUB4
-.LDGEMM_L1x8_LOOP_START:
+LDGEMM_L1x8_LOOP_START:
+ dcbt AO, PRE
LOAD1x8_1
KERNEL1x8_I1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -2
- ble .LDGEMM_L1x8_LOOP_END
+ ble LDGEMM_L1x8_LOOP_END
.align 5
-.LDGEMM_L1x8_LOOP:
+LDGEMM_L1x8_LOOP:
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -1
- bgt .LDGEMM_L1x8_LOOP
+ bgt LDGEMM_L1x8_LOOP
-.LDGEMM_L1x8_LOOP_END:
+LDGEMM_L1x8_LOOP_END:
KERNEL1x8_1
KERNEL1x8_2
@@ -1336,9 +1353,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x8_1
KERNEL1x8_E2
- b .LDGEMM_L1x8_SUB1
+ b LDGEMM_L1x8_SUB1
-.LDGEMM_L1x8_SUB4:
+LDGEMM_L1x8_SUB4:
KERNEL1x8_SUBI1
KERNEL1x8_SUB1
@@ -1350,48 +1367,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x8_SUB1
KERNEL1x8_SUB1
- b .LDGEMM_L1x8_SUB1
+ b LDGEMM_L1x8_SUB1
-.LDGEMM_L1x8_SUB0:
+LDGEMM_L1x8_SUB0:
andi. L, K, 7
KERNEL1x8_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x8_SAVE
- b .LDGEMM_L1x8_SUB2
+ ble LDGEMM_L1x8_SAVE
+ b LDGEMM_L1x8_SUB2
-.LDGEMM_L1x8_SUB1:
+LDGEMM_L1x8_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x8_SAVE
+ ble LDGEMM_L1x8_SAVE
-.LDGEMM_L1x8_SUB2:
+LDGEMM_L1x8_SUB2:
KERNEL1x8_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x8_SUB2
+ bgt LDGEMM_L1x8_SUB2
-.LDGEMM_L1x8_SAVE:
+LDGEMM_L1x8_SAVE:
SAVE1x8
-.LDGEMM_L1x8_END:
+LDGEMM_L1x8_END:
-.LDGEMM_L1x4_BEGIN:
+LDGEMM_L1x4_BEGIN:
andi. T1, M, 4
- ble .LDGEMM_L1x4_END
+ ble LDGEMM_L1x4_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x4_SUB0
+ ble LDGEMM_L1x4_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x4_SUB4
+ ble LDGEMM_L1x4_SUB4
-.LDGEMM_L1x4_LOOP_START:
+LDGEMM_L1x4_LOOP_START:
LOAD1x4_1
KERNEL1x4_I1
@@ -1405,11 +1422,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_2
addic. L, L, -2
- ble .LDGEMM_L1x4_LOOP_END
+ ble LDGEMM_L1x4_LOOP_END
.align 5
-.LDGEMM_L1x4_LOOP:
+LDGEMM_L1x4_LOOP:
KERNEL1x4_1
KERNEL1x4_2
@@ -1422,9 +1439,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_2
addic. L, L, -1
- bgt .LDGEMM_L1x4_LOOP
+ bgt LDGEMM_L1x4_LOOP
-.LDGEMM_L1x4_LOOP_END:
+LDGEMM_L1x4_LOOP_END:
KERNEL1x4_1
KERNEL1x4_2
@@ -1436,9 +1453,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_1
KERNEL1x4_E2
- b .LDGEMM_L1x4_SUB1
+ b LDGEMM_L1x4_SUB1
-.LDGEMM_L1x4_SUB4:
+LDGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1
KERNEL1x4_SUB1
@@ -1450,48 +1467,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_SUB1
KERNEL1x4_SUB1
- b .LDGEMM_L1x4_SUB1
+ b LDGEMM_L1x4_SUB1
-.LDGEMM_L1x4_SUB0:
+LDGEMM_L1x4_SUB0:
andi. L, K, 7
KERNEL1x4_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x4_SAVE
- b .LDGEMM_L1x4_SUB2
+ ble LDGEMM_L1x4_SAVE
+ b LDGEMM_L1x4_SUB2
-.LDGEMM_L1x4_SUB1:
+LDGEMM_L1x4_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x4_SAVE
+ ble LDGEMM_L1x4_SAVE
-.LDGEMM_L1x4_SUB2:
+LDGEMM_L1x4_SUB2:
KERNEL1x4_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x4_SUB2
+ bgt LDGEMM_L1x4_SUB2
-.LDGEMM_L1x4_SAVE:
+LDGEMM_L1x4_SAVE:
SAVE1x4
-.LDGEMM_L1x4_END:
+LDGEMM_L1x4_END:
-.LDGEMM_L1x2_BEGIN:
+LDGEMM_L1x2_BEGIN:
andi. T1, M, 2
- ble .LDGEMM_L1x2_END
+ ble LDGEMM_L1x2_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x2_SUB0
+ ble LDGEMM_L1x2_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x2_SUB4
+ ble LDGEMM_L1x2_SUB4
-.LDGEMM_L1x2_LOOP_START:
+LDGEMM_L1x2_LOOP_START:
LOAD1x2_1
KERNEL1x2_I1
@@ -1505,11 +1522,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_2
addic. L, L, -2
- ble .LDGEMM_L1x2_LOOP_END
+ ble LDGEMM_L1x2_LOOP_END
.align 5
-.LDGEMM_L1x2_LOOP:
+LDGEMM_L1x2_LOOP:
KERNEL1x2_1
KERNEL1x2_2
@@ -1522,9 +1539,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_2
addic. L, L, -1
- bgt .LDGEMM_L1x2_LOOP
+ bgt LDGEMM_L1x2_LOOP
-.LDGEMM_L1x2_LOOP_END:
+LDGEMM_L1x2_LOOP_END:
KERNEL1x2_1
KERNEL1x2_2
@@ -1536,9 +1553,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_1
KERNEL1x2_E2
- b .LDGEMM_L1x2_SUB1
+ b LDGEMM_L1x2_SUB1
-.LDGEMM_L1x2_SUB4:
+LDGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1
KERNEL1x2_SUB1
@@ -1550,48 +1567,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_SUB1
KERNEL1x2_SUB1
- b .LDGEMM_L1x2_SUB1
+ b LDGEMM_L1x2_SUB1
-.LDGEMM_L1x2_SUB0:
+LDGEMM_L1x2_SUB0:
andi. L, K, 7
KERNEL1x2_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x2_SAVE
- b .LDGEMM_L1x2_SUB2
+ ble LDGEMM_L1x2_SAVE
+ b LDGEMM_L1x2_SUB2
-.LDGEMM_L1x2_SUB1:
+LDGEMM_L1x2_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x2_SAVE
+ ble LDGEMM_L1x2_SAVE
-.LDGEMM_L1x2_SUB2:
+LDGEMM_L1x2_SUB2:
KERNEL1x2_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x2_SUB2
+ bgt LDGEMM_L1x2_SUB2
-.LDGEMM_L1x2_SAVE:
+LDGEMM_L1x2_SAVE:
SAVE1x2
-.LDGEMM_L1x2_END:
+LDGEMM_L1x2_END:
-.LDGEMM_L1x1_BEGIN:
+LDGEMM_L1x1_BEGIN:
andi. T1, M, 1
- ble .LDGEMM_L1x1_END
+ ble LDGEMM_L1x1_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x1_SUB0
+ ble LDGEMM_L1x1_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x1_SUB4
+ ble LDGEMM_L1x1_SUB4
-.LDGEMM_L1x1_LOOP_START:
+LDGEMM_L1x1_LOOP_START:
LOAD1x1_1
KERNEL1x1_I1
@@ -1605,11 +1622,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_2
addic. L, L, -2
- ble .LDGEMM_L1x1_LOOP_END
+ ble LDGEMM_L1x1_LOOP_END
.align 5
-.LDGEMM_L1x1_LOOP:
+LDGEMM_L1x1_LOOP:
KERNEL1x1_1
KERNEL1x1_2
@@ -1622,9 +1639,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_2
addic. L, L, -1
- bgt .LDGEMM_L1x1_LOOP
+ bgt LDGEMM_L1x1_LOOP
-.LDGEMM_L1x1_LOOP_END:
+LDGEMM_L1x1_LOOP_END:
KERNEL1x1_1
KERNEL1x1_2
@@ -1636,9 +1653,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_1
KERNEL1x1_E2
- b .LDGEMM_L1x1_SUB1
+ b LDGEMM_L1x1_SUB1
-.LDGEMM_L1x1_SUB4:
+LDGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1
KERNEL1x1_SUB1
@@ -1650,34 +1667,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_SUB1
KERNEL1x1_SUB1
- b .LDGEMM_L1x1_SUB1
+ b LDGEMM_L1x1_SUB1
-.LDGEMM_L1x1_SUB0:
+LDGEMM_L1x1_SUB0:
andi. L, K, 7
KERNEL1x1_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x1_SAVE
- b .LDGEMM_L1x1_SUB2
+ ble LDGEMM_L1x1_SAVE
+ b LDGEMM_L1x1_SUB2
-.LDGEMM_L1x1_SUB1:
+LDGEMM_L1x1_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x1_SAVE
+ ble LDGEMM_L1x1_SAVE
-.LDGEMM_L1x1_SUB2:
+LDGEMM_L1x1_SUB2:
KERNEL1x1_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x1_SUB2
+ bgt LDGEMM_L1x1_SUB2
-.LDGEMM_L1x1_SAVE:
+LDGEMM_L1x1_SAVE:
SAVE1x1
-.LDGEMM_L1x1_END:
+LDGEMM_L1x1_END:
-.LDGEMM_L1_END:
+LDGEMM_L1_END:
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
index 27c05e08e..2c7851207 100644
--- a/kernel/power/dgemm_macros_16x4_power8.S
+++ b/kernel/power/dgemm_macros_16x4_power8.S
@@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
- addi AO, AO, 64
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
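+/* the o64..o112 offsets replace the intermediate pointer bump; AO is
+   now advanced by 128 in a single addi below */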
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
- addi AO, AO, 64
+ addi AO, AO, 128
addi BO, BO, 32
.endm
+
.macro KERNEL4x16_I1
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
- lxvd2x vs8, 0, AO
+ lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
- xvmuldp vs36, vs4, vs24
- xvmuldp vs37, vs5, vs24
- xvmuldp vs38, vs6, vs24
- xvmuldp vs39, vs7, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
- xvmuldp vs42, vs2, vs25
- xvmuldp vs43, vs3, vs25
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
- xvmuldp vs44, vs4, vs25
- xvmuldp vs45, vs5, vs25
- xvmuldp vs46, vs6, vs25
- xvmuldp vs47, vs7, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
- addi AO, AO, 64
- xvmuldp vs48, vs0, vs26
- xvmuldp vs49, vs1, vs26
- xvmuldp vs50, vs2, vs26
- xvmuldp vs51, vs3, vs26
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
+ lxvd2x vs12, o64, AO
+ lxvd2x vs13, o80, AO
- xvmuldp vs52, vs4, vs26
- xvmuldp vs53, vs5, vs26
- xvmuldp vs54, vs6, vs26
- xvmuldp vs55, vs7, vs26
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
+ lxvd2x vs14, o96, AO
+ lxvd2x vs15, o112, AO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
- xvmuldp vs56, vs0, vs27
- xvmuldp vs57, vs1, vs27
- xvmuldp vs58, vs2, vs27
- xvmuldp vs59, vs3, vs27
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
- xvmuldp vs60, vs4, vs27
- xvmuldp vs61, vs5, vs27
- xvmuldp vs62, vs6, vs27
- xvmuldp vs63, vs7, vs27
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
- addi AO, AO, 64
- addi BO, BO, 32
+ addi AO, AO, 128
.endm
+
+
.macro KERNEL4x16_1
xvmaddadp vs32, vs0, vs24
@@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
- lxvd2x vs8, 0, AO
+ lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
@@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
- addi AO, AO, 64
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
+ lxvd2x vs12, o64, AO
+ lxvd2x vs13, o80, AO
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
+ lxvd2x vs14, o96, AO
+ lxvd2x vs15, o112, AO
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
@@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
- addi AO, AO, 64
+ addi AO, AO, 128
addi BO, BO, 32
.endm
@@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
- addi AO, AO, 64
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
@@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31
- addi AO, AO, 64
+ addi AO, AO, 128
addi BO, BO, 32
.endm
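+
+/* KERNEL4x16_L1 and KERNEL4x16_L2 below form a two-stage software
+   pipeline: each stage runs its FMAs on one register set (vs0-vs7 with
+   vs24-vs27, or vs8-vs15 with vs28-vs31) while loading the operands of
+   the other stage */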
+.macro KERNEL4x16_L1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, o0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvd2x vs12, o64, AO
+ lxvd2x vs13, o80, AO
+
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ lxvd2x vs14, o96, AO
+ lxvd2x vs15, o112, AO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+ addi AO, AO, 128
+
+.endm
+
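+/* second pipeline stage: consumes vs8-vs15 and vs28-vs31, refills
+   vs0-vs7 and vs24-vs27; BO advances by 64 bytes here, covering the
+   B values of both stages */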
+.macro KERNEL4x16_L2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ lxvdsx vs24, o32, BO
+ lxvdsx vs25, o40, BO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
+
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ lxvdsx vs26, o48, BO
+ lxvdsx vs27, o56, BO
+
+ xvmaddadp vs60, vs12, vs31
+ addi AO, AO, 128
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ addi BO, BO, 64
+ xvmaddadp vs63, vs15, vs31
+
+
+.endm
+
+
.macro KERNEL4x16_E2
@@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
- addi AO, AO, 64
- addi BO, BO, 32
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
- addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
@@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
+ addi BO, BO, 32
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
@@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
+ addi AO, AO, 128
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
@@ -430,21 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16
mr T1, CO
- addi T2, T1, 64
+ add T2, T1, LDC
+ add T3, T2, LDC
+ add T4, T3, LDC
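+
+/* T1..T4 now address the four rows of C; the TRMMKERNEL branches are
+   gone from this macro, as dtrmm gets its own copy of these macros in
+   dtrmm_macros_16x4_power8.S (see below) */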
-#ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
+ lxvd2x vs0, 0, CO
+ lxvd2x vs1, o16, CO
+ lxvd2x vs2, o32, CO
+ lxvd2x vs3, o48, CO
+ lxvd2x vs4, o64, CO
+ lxvd2x vs5, o80, CO
+ lxvd2x vs6, o96, CO
+ lxvd2x vs7, o112, CO
- lxvd2x vs4, 0, T2
- lxvd2x vs5, o16, T2
- lxvd2x vs6, o32, T2
- lxvd2x vs7, o48, T2
-#endif
+ lxvd2x vs8, 0, T2
+ lxvd2x vs9, o16, T2
+ lxvd2x vs10, o32, T2
+ lxvd2x vs11, o48, T2
+ lxvd2x vs12, o64, T2
+ lxvd2x vs13, o80, T2
+ lxvd2x vs14, o96, T2
+ lxvd2x vs15, o112, T2
+
+ lxvd2x vs24, 0, T3
+ lxvd2x vs25, o16, T3
+ lxvd2x vs26, o32, T3
+ lxvd2x vs27, o48, T3
+ lxvd2x vs28, o64, T3
+ lxvd2x vs29, o80, T3
+ lxvd2x vs30, o96, T3
+ lxvd2x vs31, o112, T3
-#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
@@ -453,172 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
-#else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- xvmuldp vs2, vs34, alpha_r
- xvmuldp vs3, vs35, alpha_r
- xvmuldp vs4, vs36, alpha_r
- xvmuldp vs5, vs37, alpha_r
- xvmuldp vs6, vs38, alpha_r
- xvmuldp vs7, vs39, alpha_r
-#endif
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
+ lxvd2x vs32, 0, T4
+ lxvd2x vs33, o16, T4
+ lxvd2x vs34, o32, T4
+ lxvd2x vs35, o48, T4
+ lxvd2x vs36, o64, T4
+ lxvd2x vs37, o80, T4
+ lxvd2x vs38, o96, T4
+ lxvd2x vs39, o112, T4
- dcbt T1, PRE
-
- stxvd2x vs4, 0, T2
- stxvd2x vs5, o16, T2
- stxvd2x vs6, o32, T2
- stxvd2x vs7, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
-
-#ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- lxvd2x vs10, o32, T1
- lxvd2x vs11, o48, T1
-
- lxvd2x vs12, 0, T2
- lxvd2x vs13, o16, T2
- lxvd2x vs14, o32, T2
- lxvd2x vs15, o48, T2
-#endif
-
-#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
- xvmaddadp vs12, vs44, alpha_r
- xvmaddadp vs13, vs45, alpha_r
- xvmaddadp vs14, vs46, alpha_r
- xvmaddadp vs15, vs47, alpha_r
-#else
- xvmuldp vs8, vs40, alpha_r
- xvmuldp vs9, vs41, alpha_r
- xvmuldp vs10, vs42, alpha_r
- xvmuldp vs11, vs43, alpha_r
- xvmuldp vs12, vs44, alpha_r
- xvmuldp vs13, vs45, alpha_r
- xvmuldp vs14, vs46, alpha_r
- xvmuldp vs15, vs47, alpha_r
-#endif
-
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- dcbt T1, PRE
-
- stxvd2x vs12, 0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
-
-#ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
-
- lxvd2x vs4, 0, T2
- lxvd2x vs5, o16, T2
- lxvd2x vs6, o32, T2
- lxvd2x vs7, o48, T2
-#endif
-
-#ifndef TRMMKERNEL
- xvmaddadp vs0, vs48, alpha_r
- xvmaddadp vs1, vs49, alpha_r
- xvmaddadp vs2, vs50, alpha_r
- xvmaddadp vs3, vs51, alpha_r
- xvmaddadp vs4, vs52, alpha_r
- xvmaddadp vs5, vs53, alpha_r
- xvmaddadp vs6, vs54, alpha_r
- xvmaddadp vs7, vs55, alpha_r
-#else
- xvmuldp vs0, vs48, alpha_r
- xvmuldp vs1, vs49, alpha_r
- xvmuldp vs2, vs50, alpha_r
- xvmuldp vs3, vs51, alpha_r
- xvmuldp vs4, vs52, alpha_r
- xvmuldp vs5, vs53, alpha_r
- xvmuldp vs6, vs54, alpha_r
- xvmuldp vs7, vs55, alpha_r
-#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
- dcbt T1, PRE
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
- stxvd2x vs4, 0, T2
- stxvd2x vs5, o16, T2
- stxvd2x vs6, o32, T2
- stxvd2x vs7, o48, T2
+ stxvd2x vs4, o64, T1
+ stxvd2x vs5, o80, T1
+ stxvd2x vs6, o96, T1
+ stxvd2x vs7, o112, T1
- add T1, T1, LDC
- add T2, T2, LDC
+ xvmaddadp vs24, vs48, alpha_r
+ xvmaddadp vs25, vs49, alpha_r
+ xvmaddadp vs26, vs50, alpha_r
+ xvmaddadp vs27, vs51, alpha_r
-#ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- lxvd2x vs10, o32, T1
- lxvd2x vs11, o48, T1
+ stxvd2x vs8, o0, T2
+ stxvd2x vs9, o16, T2
+ stxvd2x vs10, o32, T2
+ stxvd2x vs11, o48, T2
- lxvd2x vs12, 0, T2
- lxvd2x vs13, o16, T2
- lxvd2x vs14, o32, T2
- lxvd2x vs15, o48, T2
-#endif
+ xvmaddadp vs28, vs52, alpha_r
+ xvmaddadp vs29, vs53, alpha_r
+ xvmaddadp vs30, vs54, alpha_r
+ xvmaddadp vs31, vs55, alpha_r
-#ifndef TRMMKERNEL
- xvmaddadp vs8, vs56, alpha_r
- xvmaddadp vs9, vs57, alpha_r
- xvmaddadp vs10, vs58, alpha_r
- xvmaddadp vs11, vs59, alpha_r
- xvmaddadp vs12, vs60, alpha_r
- xvmaddadp vs13, vs61, alpha_r
- xvmaddadp vs14, vs62, alpha_r
- xvmaddadp vs15, vs63, alpha_r
-#else
- xvmuldp vs8, vs56, alpha_r
- xvmuldp vs9, vs57, alpha_r
- xvmuldp vs10, vs58, alpha_r
- xvmuldp vs11, vs59, alpha_r
- xvmuldp vs12, vs60, alpha_r
- xvmuldp vs13, vs61, alpha_r
- xvmuldp vs14, vs62, alpha_r
- xvmuldp vs15, vs63, alpha_r
-#endif
+ stxvd2x vs12, o64, T2
+ stxvd2x vs13, o80, T2
+ stxvd2x vs14, o96, T2
+ stxvd2x vs15, o112, T2
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
+ xvmaddadp vs32, vs56, alpha_r
+ xvmaddadp vs33, vs57, alpha_r
+ xvmaddadp vs34, vs58, alpha_r
+ xvmaddadp vs35, vs59, alpha_r
- dcbt T1, PRE
+ stxvd2x vs24, 0, T3
+ stxvd2x vs25, o16, T3
+ stxvd2x vs26, o32, T3
+ stxvd2x vs27, o48, T3
- stxvd2x vs12, 0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
+ xvmaddadp vs36, vs60, alpha_r
+ xvmaddadp vs37, vs61, alpha_r
+ xvmaddadp vs38, vs62, alpha_r
+ xvmaddadp vs39, vs63, alpha_r
+
+ stxvd2x vs28, o64, T3
+ stxvd2x vs29, o80, T3
+ stxvd2x vs30, o96, T3
+ stxvd2x vs31, o112, T3
+
+ stxvd2x vs32, o0, T4
+ stxvd2x vs33, o16, T4
+ stxvd2x vs34, o32, T4
+ stxvd2x vs35, o48, T4
addi CO, CO, 128
+ stxvd2x vs36, o64, T4
+ stxvd2x vs37, o80, T4
+ stxvd2x vs38, o96, T4
+ stxvd2x vs39, o112, T4
+
+
.endm
/*********************************************************************
diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S
new file mode 100644
index 000000000..31966047f
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_4_power8.S
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define o64 r17
+#define o80 r18
+#define o96 r19
+#define o112 r20
+#define o8 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTU1 r27
+#define NOTU2 r30
+#define T1 r31
+
+#define o0 0
+
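+/* packs the M x N matrix A (leading dimension LDA; arguments in
+   r3..r7) into the contiguous panel buffer B, four columns at a time,
+   in the layout the POWER8 16x4 DGEMM kernel consumes */
+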
+#include "dgemm_ncopy_macros_4_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, BASE_SHIFT
+
+ li PREA, 384
+ li PREB, 384
+
+ li o8, 8
+ li o16, 16
+ li o32, 32
+ li o48, 48
+ li o64, 64
+ li o80, 80
+ li o96, 96
+ li o112, 112
+
+#include "dgemm_ncopy_logic_4_power8.S"
+
+L999:
+
+ li r3, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/dgemm_ncopy_logic_4_power8.S b/kernel/power/dgemm_ncopy_logic_4_power8.S
new file mode 100644
index 000000000..6944a7818
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_logic_4_power8.S
@@ -0,0 +1,237 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
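+/* column loop: groups of four (N >> 2), then the N & 2 and N & 1
+   remainders; each group dispatches on M in 16/8/4/2/1 chunks */
+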
+ mr BO, B
+ srawi. I, N, 2
+ ble DCOPYN_L2_BEGIN
+
+
+DCOPYN_L4_BEGIN:
+
+
+DCOPYN_L4_LOOP:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+
+DCOPYN_L4x16_BEGIN:
+
+ srawi. J, M, 4
+ ble DCOPYN_L4x16_END
+
+DCOPYN_L4x16_LOOP:
+
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ COPY_4x16
+ addic. J, J, -1
+ bgt DCOPYN_L4x16_LOOP
+
+DCOPYN_L4x16_END:
+
+
+DCOPYN_L4x8_BEGIN:
+
+ andi. J, M, 8
+ ble DCOPYN_L4x8_END
+ COPY_4x8
+
+DCOPYN_L4x8_END:
+
+
+DCOPYN_L4x4_BEGIN:
+
+ andi. J, M, 4
+ ble DCOPYN_L4x4_END
+ COPY_4x4
+
+DCOPYN_L4x4_END:
+
+
+DCOPYN_L4x2_BEGIN:
+
+ andi. J, M, 2
+ ble DCOPYN_L4x2_END
+ COPY_4x2
+
+DCOPYN_L4x2_END:
+
+
+DCOPYN_L4x1_BEGIN:
+
+ andi. J, M, 1
+ ble DCOPYN_L4x1_END
+ COPY_4x1
+
+DCOPYN_L4x1_END:
+
+
+DCOPYN_L4_END:
+
+ addic. I, I, -1
+ bgt DCOPYN_L4_LOOP
+
+DCOPYN_L2_BEGIN:
+
+	andi.	T1, N, 2
+ ble DCOPYN_L2_END
+
+DCOPYN_L2_LOOP:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+
+DCOPYN_L2x16_BEGIN:
+
+ srawi. J, M, 4
+ ble DCOPYN_L2x16_END
+
+DCOPYN_L2x16_LOOP:
+
+ COPY_2x16
+ addic. J, J, -1
+ bgt DCOPYN_L2x16_LOOP
+
+DCOPYN_L2x16_END:
+
+
+DCOPYN_L2x8_BEGIN:
+
+ andi. J, M, 8
+ ble DCOPYN_L2x8_END
+ COPY_2x8
+
+DCOPYN_L2x8_END:
+
+
+DCOPYN_L2x4_BEGIN:
+
+ andi. J, M, 4
+ ble DCOPYN_L2x4_END
+ COPY_2x4
+
+DCOPYN_L2x4_END:
+
+
+DCOPYN_L2x2_BEGIN:
+
+ andi. J, M, 2
+ ble DCOPYN_L2x2_END
+ COPY_2x2
+
+DCOPYN_L2x2_END:
+
+
+DCOPYN_L2x1_BEGIN:
+
+ andi. J, M, 1
+ ble DCOPYN_L2x1_END
+ COPY_2x1
+
+DCOPYN_L2x1_END:
+
+
+DCOPYN_L2_END:
+
+
+DCOPYN_L1_BEGIN:
+
+	andi.	T1, N, 1
+ ble DCOPYN_L1_END
+
+DCOPYN_L1_LOOP:
+
+ mr A0, A
+ add A, A0, LDA
+
+DCOPYN_L1x16_BEGIN:
+
+ srawi. J, M, 4
+ ble DCOPYN_L1x16_END
+
+DCOPYN_L1x16_LOOP:
+
+ COPY_1x16
+ addic. J, J, -1
+ bgt DCOPYN_L1x16_LOOP
+
+DCOPYN_L1x16_END:
+
+
+DCOPYN_L1x8_BEGIN:
+
+ andi. J, M, 8
+ ble DCOPYN_L1x8_END
+ COPY_1x8
+
+DCOPYN_L1x8_END:
+
+
+DCOPYN_L1x4_BEGIN:
+
+ andi. J, M, 4
+ ble DCOPYN_L1x4_END
+ COPY_1x4
+
+DCOPYN_L1x4_END:
+
+
+DCOPYN_L1x2_BEGIN:
+
+ andi. J, M, 2
+ ble DCOPYN_L1x2_END
+ COPY_1x2
+
+DCOPYN_L1x2_END:
+
+
+DCOPYN_L1x1_BEGIN:
+
+ andi. J, M, 1
+ ble DCOPYN_L1x1_END
+ COPY_1x1
+
+DCOPYN_L1x1_END:
+
+
+DCOPYN_L1_END:
+
diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S
new file mode 100644
index 000000000..9b07d73f5
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_macros_4_power8.S
@@ -0,0 +1,691 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
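+/* reads 16 doubles from each of the four column pointers A0-A3 and
+   interleaves them with xxpermdi into 4-wide rows at BO (512 bytes
+   per call) */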
+.macro COPY_4x16
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs8, o0, A1
+ lxvd2x vs24, o0, A3
+ lxvd2x vs16, o0, A2
+
+ lxvd2x vs1, o16, A0
+ lxvd2x vs9, o16, A1
+ lxvd2x vs17, o16, A2
+ lxvd2x vs25, o16, A3
+
+ lxvd2x vs2, o32, A0
+ lxvd2x vs10, o32, A1
+ lxvd2x vs18, o32, A2
+ lxvd2x vs26, o32, A3
+
+ lxvd2x vs3, o48, A0
+ lxvd2x vs11, o48, A1
+ lxvd2x vs19, o48, A2
+ lxvd2x vs27, o48, A3
+
+ lxvd2x vs4, o64, A0
+ lxvd2x vs12, o64, A1
+ lxvd2x vs20, o64, A2
+ lxvd2x vs28, o64, A3
+
+ lxvd2x vs5, o80, A0
+ lxvd2x vs13, o80, A1
+ lxvd2x vs21, o80, A2
+ lxvd2x vs29, o80, A3
+
+ lxvd2x vs6, o96, A0
+ lxvd2x vs14, o96, A1
+ lxvd2x vs22, o96, A2
+ lxvd2x vs30, o96, A3
+
+ lxvd2x vs7, o112, A0
+ lxvd2x vs15, o112, A1
+ lxvd2x vs23, o112, A2
+ lxvd2x vs31, o112, A3
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+ xxpermdi vs36, vs1, vs9, 0
+ xxpermdi vs37, vs17, vs25, 0
+ xxpermdi vs38, vs1, vs9, 3
+ xxpermdi vs39, vs17, vs25, 3
+
+ xxpermdi vs40, vs2, vs10, 0
+ xxpermdi vs41, vs18, vs26, 0
+ xxpermdi vs42, vs2, vs10, 3
+ xxpermdi vs43, vs18, vs26, 3
+
+ xxpermdi vs44, vs3, vs11, 0
+ xxpermdi vs45, vs19, vs27, 0
+ xxpermdi vs46, vs3, vs11, 3
+ xxpermdi vs47, vs19, vs27, 3
+
+ xxpermdi vs48, vs4, vs12, 0
+ xxpermdi vs49, vs20, vs28, 0
+ xxpermdi vs50, vs4, vs12, 3
+ xxpermdi vs51, vs20, vs28, 3
+
+ xxpermdi vs52, vs5, vs13, 0
+ xxpermdi vs53, vs21, vs29, 0
+ xxpermdi vs54, vs5, vs13, 3
+ xxpermdi vs55, vs21, vs29, 3
+
+ addi A0, A0, 128
+ addi A1, A1, 128
+
+ xxpermdi vs56, vs6, vs14, 0
+ xxpermdi vs57, vs22, vs30, 0
+ xxpermdi vs58, vs6, vs14, 3
+ xxpermdi vs59, vs22, vs30, 3
+
+ addi A3, A3, 128
+ addi A2, A2, 128
+
+ xxpermdi vs60, vs7, vs15, 0
+ xxpermdi vs61, vs23, vs31, 0
+ xxpermdi vs62, vs7, vs15, 3
+ xxpermdi vs63, vs23, vs31, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs40, o0, BO
+ stxvd2x vs41, o16, BO
+ stxvd2x vs42, o32, BO
+ stxvd2x vs43, o48, BO
+ stxvd2x vs44, o64, BO
+ stxvd2x vs45, o80, BO
+ stxvd2x vs46, o96, BO
+ stxvd2x vs47, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs48, o0, BO
+ stxvd2x vs49, o16, BO
+ stxvd2x vs50, o32, BO
+ stxvd2x vs51, o48, BO
+ stxvd2x vs52, o64, BO
+ stxvd2x vs53, o80, BO
+ stxvd2x vs54, o96, BO
+ stxvd2x vs55, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs56, o0, BO
+ stxvd2x vs57, o16, BO
+ stxvd2x vs58, o32, BO
+ stxvd2x vs59, o48, BO
+ stxvd2x vs60, o64, BO
+ stxvd2x vs61, o80, BO
+ stxvd2x vs62, o96, BO
+ stxvd2x vs63, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ lxvd2x vs10, o32, A1
+ lxvd2x vs11, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs16, o0, A2
+ lxvd2x vs17, o16, A2
+ lxvd2x vs18, o32, A2
+ lxvd2x vs19, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs24, o0, A3
+ lxvd2x vs25, o16, A3
+ lxvd2x vs26, o32, A3
+ lxvd2x vs27, o48, A3
+ addi A3, A3, 64
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+ xxpermdi vs36, vs1, vs9, 0
+ xxpermdi vs37, vs17, vs25, 0
+ xxpermdi vs38, vs1, vs9, 3
+ xxpermdi vs39, vs17, vs25, 3
+
+ xxpermdi vs40, vs2, vs10, 0
+ xxpermdi vs41, vs18, vs26, 0
+ xxpermdi vs42, vs2, vs10, 3
+ xxpermdi vs43, vs18, vs26, 3
+
+ xxpermdi vs44, vs3, vs11, 0
+ xxpermdi vs45, vs19, vs27, 0
+ xxpermdi vs46, vs3, vs11, 3
+ xxpermdi vs47, vs19, vs27, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs40, o0, BO
+ stxvd2x vs41, o16, BO
+ stxvd2x vs42, o32, BO
+ stxvd2x vs43, o48, BO
+ stxvd2x vs44, o64, BO
+ stxvd2x vs45, o80, BO
+ stxvd2x vs46, o96, BO
+ stxvd2x vs47, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ addi A1, A1, 32
+
+
+ lxvd2x vs16, o0, A2
+ lxvd2x vs17, o16, A2
+ addi A2, A2, 32
+
+
+ lxvd2x vs24, o0, A3
+ lxvd2x vs25, o16, A3
+ addi A3, A3, 32
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+ xxpermdi vs36, vs1, vs9, 0
+ xxpermdi vs37, vs17, vs25, 0
+ xxpermdi vs38, vs1, vs9, 3
+ xxpermdi vs39, vs17, vs25, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxvd2x vs0, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs8, o0, A1
+ addi A1, A1, 16
+
+
+ lxvd2x vs16, o0, A2
+ addi A2, A2, 16
+
+
+ lxvd2x vs24, o0, A3
+ addi A3, A3, 16
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxsdx vs0, o0, A0
+ addi A0, A0, 8
+
+
+ lxsdx vs8, o0, A1
+ addi A1, A1, 8
+
+
+ lxsdx vs16, o0, A2
+ addi A2, A2, 8
+
+
+ lxsdx vs24, o0, A3
+ addi A3, A3, 8
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ addi BO, BO, 32
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
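+/* two-column variant: xxpermdi with immediates 0 and 3 pairs the
+   matching doublewords of A0 and A1 into 2-wide rows */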
+.macro COPY_2x16
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ lxvd2x vs4, o64, A0
+ lxvd2x vs5, o80, A0
+ lxvd2x vs6, o96, A0
+ lxvd2x vs7, o112, A0
+ addi A0, A0, 128
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ lxvd2x vs10, o32, A1
+ lxvd2x vs11, o48, A1
+ lxvd2x vs12, o64, A1
+ lxvd2x vs13, o80, A1
+ lxvd2x vs14, o96, A1
+ lxvd2x vs15, o112, A1
+ addi A1, A1, 128
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+ xxpermdi vs34, vs1, vs9, 0
+ xxpermdi vs35, vs1, vs9, 3
+
+ xxpermdi vs36, vs2, vs10, 0
+ xxpermdi vs37, vs2, vs10, 3
+
+ xxpermdi vs38, vs3, vs11, 0
+ xxpermdi vs39, vs3, vs11, 3
+
+ xxpermdi vs40, vs4, vs12, 0
+ xxpermdi vs41, vs4, vs12, 3
+
+ xxpermdi vs42, vs5, vs13, 0
+ xxpermdi vs43, vs5, vs13, 3
+
+ xxpermdi vs44, vs6, vs14, 0
+ xxpermdi vs45, vs6, vs14, 3
+
+ xxpermdi vs46, vs7, vs15, 0
+ xxpermdi vs47, vs7, vs15, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs40, o0, BO
+ stxvd2x vs41, o16, BO
+ stxvd2x vs42, o32, BO
+ stxvd2x vs43, o48, BO
+ stxvd2x vs44, o64, BO
+ stxvd2x vs45, o80, BO
+ stxvd2x vs46, o96, BO
+ stxvd2x vs47, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ lxvd2x vs10, o32, A1
+ lxvd2x vs11, o48, A1
+ addi A1, A1, 64
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+ xxpermdi vs34, vs1, vs9, 0
+ xxpermdi vs35, vs1, vs9, 3
+
+ xxpermdi vs36, vs2, vs10, 0
+ xxpermdi vs37, vs2, vs10, 3
+
+ xxpermdi vs38, vs3, vs11, 0
+ xxpermdi vs39, vs3, vs11, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ addi A1, A1, 32
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+ xxpermdi vs34, vs1, vs9, 0
+ xxpermdi vs35, vs1, vs9, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxvd2x vs0, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs8, o0, A1
+ addi A1, A1, 16
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ addi BO, BO, 32
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxsdx vs0, o0, A0
+ addi A0, A0, 8
+
+
+ lxsdx vs8, o0, A1
+ addi A1, A1, 8
+
+
+ xxpermdi vs32, vs0, vs8, 0
+
+
+ stxvd2x vs32, o0, BO
+ addi BO, BO, 16
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
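+/* single column: a straight 128-byte copy, no interleaving required */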
+.macro COPY_1x16
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ lxvd2x vs4, o64, A0
+ lxvd2x vs5, o80, A0
+ lxvd2x vs6, o96, A0
+ lxvd2x vs7, o112, A0
+ addi A0, A0, 128
+
+
+ stxvd2x vs0, o0, BO
+ stxvd2x vs1, o16, BO
+ stxvd2x vs2, o32, BO
+ stxvd2x vs3, o48, BO
+ addi BO, BO, 64
+
+ stxvd2x vs4, o0, BO
+ stxvd2x vs5, o16, BO
+ stxvd2x vs6, o32, BO
+ stxvd2x vs7, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ addi A0, A0, 64
+
+
+ stxvd2x vs0, o0, BO
+ stxvd2x vs1, o16, BO
+ stxvd2x vs2, o32, BO
+ stxvd2x vs3, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ addi A0, A0, 32
+
+
+ stxvd2x vs0, o0, BO
+ stxvd2x vs1, o16, BO
+ addi BO, BO, 32
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxvd2x vs0, o0, A0
+ addi A0, A0, 16
+
+
+ stxvd2x vs0, o0, BO
+ addi BO, BO, 16
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxsdx vs0, o0, A0
+ addi A0, A0, 8
+
+
+ stxsdx vs0, o0, BO
+ addi BO, BO, 8
+
+
+.endm
+
diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S
index f87af535d..eca78bac4 100644
--- a/kernel/power/dgemm_tcopy_16_power8.S
+++ b/kernel/power/dgemm_tcopy_16_power8.S
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B
add B1, B1, B
- li PREA, 768
+ li PREA, 256
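+	/* dcbt prefetch distance reduced from 768 to 256 bytes */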
addi PREB, M16, 128
li o8, 8
diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S
index 776cd3401..28fc74793 100644
--- a/kernel/power/dgemm_tcopy_logic_16_power8.S
+++ b/kernel/power/dgemm_tcopy_logic_16_power8.S
@@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:
DCOPYT_L4x16_LOOP:
+/*
addi T1, PREB, 128
addi T2, PREB, 256
+*/
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
+/*
dcbtst BO, M16
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
+*/
COPY_4x16
add BO, BO, M16
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
index 2294128a2..e9dbd991e 100644
--- a/kernel/power/dtrmm_kernel_16x4_power8.S
+++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30
#define T2 r31
-#include "dgemm_macros_16x4_power8.S"
+#include "dtrmm_macros_16x4_power8.S"
#ifndef NEEDPARAM
diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S
new file mode 100644
index 000000000..079144a90
--- /dev/null
+++ b/kernel/power/dtrmm_macros_16x4_power8.S
@@ -0,0 +1,3431 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
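+
+/* Naming scheme: for each NxM tile of C, LOADNxM_1 preloads the first A/B
+ * slice, KERNELNxM_I1 starts the accumulators with xvmuldp while fetching
+ * the next slice into the second register set, KERNELNxM_1 and KERNELNxM_2
+ * ping-pong between the two register sets with xvmaddadp, KERNELNxM_E2
+ * drains the pipeline without issuing further loads, and KERNELNxM_SUBI1 /
+ * KERNELNxM_SUB1 are unpipelined single iterations for the remainder loop.
+ * SAVENxM scales the accumulators by alpha and writes the C tile. */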
+
+/*********************************************************************
+* Macros for N=4, M=16 *
+*********************************************************************/
+
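+/* lxvd2x loads two consecutive doubles of A into one VSX register, while
+ * lxvdsx loads a single double of B and splats it across both lanes, so
+ * each xvmuldp/xvmaddadp below updates two C elements per B value. */
+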
+.macro LOAD4x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+.endm
+
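+/* The SAVE macros have two variants: for GEMM (TRMMKERNEL undefined) the
+ * existing C tile is loaded and updated as C += alpha*AB via xvmaddadp;
+ * for TRMM the tile is overwritten as C = alpha*AB via xvmuldp. */
+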
+.macro SAVE4x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+ xvmaddadp vs4, vs52, alpha_r
+ xvmaddadp vs5, vs53, alpha_r
+ xvmaddadp vs6, vs54, alpha_r
+ xvmaddadp vs7, vs55, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+ xvmuldp vs4, vs52, alpha_r
+ xvmuldp vs5, vs53, alpha_r
+ xvmuldp vs6, vs54, alpha_r
+ xvmuldp vs7, vs55, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+ xvmaddadp vs12, vs60, alpha_r
+ xvmaddadp vs13, vs61, alpha_r
+ xvmaddadp vs14, vs62, alpha_r
+ xvmaddadp vs15, vs63, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+ xvmuldp vs12, vs60, alpha_r
+ xvmuldp vs13, vs61, alpha_r
+ xvmuldp vs14, vs62, alpha_r
+ xvmuldp vs15, vs63, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=8 *
+*********************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4 *
+*********************************************************************/
+
+.macro LOAD4x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2 *
+*********************************************************************/
+
+.macro LOAD4x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1 *
+*********************************************************************/
+
+.macro LOAD4x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs48, alpha_r
+#else
+ xsmuldp vs0, vs48, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs56, alpha_r
+#else
+ xsmuldp vs8, vs56, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16 *
+*********************************************************************/
+
+.macro LOAD2x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8                                               *
+*********************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4 *
+*********************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2 *
+*********************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1 *
+*********************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16 *
+*********************************************************************/
+
+.macro LOAD1x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL1x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8                                               *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4 *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2 *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1 *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
diff --git a/kernel/power/sgemm_tcopy_8_power8.S b/kernel/power/sgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..2bbd6e696
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_8_power8.S
@@ -0,0 +1,210 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define o4 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTU1 r29
+#define M8 r30
+#define T1 r31
+
+#define o0 0
+
+#include "sgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, BASE_SHIFT
+ slwi M8, M, 3 + BASE_SHIFT
+
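+	// T2, PREA and PREB briefly hold the masks -8, -4 and -2 (PREA and
+	// PREB are reloaded with prefetch distances below); B4, B2 and B1
+	// end up pointing at the 4-, 2- and 1-column remainder panels of B.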
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, BASE_SHIFT
+ slwi B2, B2, BASE_SHIFT
+ slwi B1, B1, BASE_SHIFT
+
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M8, 128
+
+ li o4, 4
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "sgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/sgemm_tcopy_logic_8_power8.S b/kernel/power/sgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..4cf74baa3
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,301 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. I, M, 2
+ ble SCOPYOT_L2_BEGIN
+
+
+SCOPYOT_L4_BEGIN:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+ mr B8, B
+ addi B, B, 32*SIZE
+
+ sradi. J, N, 3
+ ble SCOPYOT_L4x4_BEGIN
+
+ mr BO, B8
+ .align 5
+
+SCOPYOT_L4x8_LOOP:
+
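+	// The loop body is unrolled four times with an exit test after each
+	// COPY_4x8, so any remaining J count is handled without a cleanup loop.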
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ COPY_4x8
+
+ addi A0, A0, 8*SIZE
+ addi A1, A1, 8*SIZE
+ addi A2, A2, 8*SIZE
+ addi A3, A3, 8*SIZE
+ add BO, BO, M8
+
+ addic. J, J, -1
+ ble SCOPYOT_L4x4_BEGIN
+
+ COPY_4x8
+
+ addi A0, A0, 8*SIZE
+ addi A1, A1, 8*SIZE
+ addi A2, A2, 8*SIZE
+ addi A3, A3, 8*SIZE
+ add BO, BO, M8
+
+ addic. J, J, -1
+ ble SCOPYOT_L4x4_BEGIN
+
+ COPY_4x8
+
+ addi A0, A0, 8*SIZE
+ addi A1, A1, 8*SIZE
+ addi A2, A2, 8*SIZE
+ addi A3, A3, 8*SIZE
+ add BO, BO, M8
+
+ addic. J, J, -1
+ ble SCOPYOT_L4x4_BEGIN
+
+ COPY_4x8
+
+ addi A0, A0, 8*SIZE
+ addi A1, A1, 8*SIZE
+ addi A2, A2, 8*SIZE
+ addi A3, A3, 8*SIZE
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt SCOPYOT_L4x8_LOOP
+
+SCOPYOT_L4x4_BEGIN:
+
+ andi. T1, N, 4
+ ble SCOPYOT_L4x2_BEGIN
+
+ mr BO, B4
+
+ COPY_4x4
+
+ addi A0, A0, 4*SIZE
+ addi A1, A1, 4*SIZE
+ addi A2, A2, 4*SIZE
+ addi A3, A3, 4*SIZE
+
+ addi B4, B4, 16*SIZE
+
+SCOPYOT_L4x2_BEGIN:
+
+ andi. T1, N, 2
+ ble SCOPYOT_L4x1_BEGIN
+
+ mr BO, B2
+
+ COPY_4x2
+
+ addi A0, A0, 2*SIZE
+ addi A1, A1, 2*SIZE
+ addi A2, A2, 2*SIZE
+ addi A3, A3, 2*SIZE
+
+ addi B2, B2, 8*SIZE
+
+SCOPYOT_L4x1_BEGIN:
+
+ andi. T1, N, 1
+ ble SCOPYOT_L4_END
+
+ mr BO, B1
+
+ COPY_4x1
+
+ addi A0, A0, 1*SIZE
+ addi A1, A1, 1*SIZE
+ addi A2, A2, 1*SIZE
+ addi A3, A3, 1*SIZE
+
+ addi B1, B1, 4*SIZE
+
+SCOPYOT_L4_END:
+
+ addic. I, I, -1
+ bgt SCOPYOT_L4_BEGIN
+
+
+
+SCOPYOT_L2_BEGIN:
+
+ andi. T1, M, 2
+ ble SCOPYOT_L1_BEGIN
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+ mr B8, B
+ addi B, B, 16*SIZE
+
+ sradi. J, N, 3
+ ble SCOPYOT_L2x4_BEGIN
+
+ mr BO, B8
+
+SCOPYOT_L2x8_LOOP:
+
+ COPY_2x8
+
+ addi A0, A0, 8*SIZE
+ addi A1, A1, 8*SIZE
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt SCOPYOT_L2x8_LOOP
+
+SCOPYOT_L2x4_BEGIN:
+
+ andi. T1, N, 4
+ ble SCOPYOT_L2x2_BEGIN
+
+ mr BO, B4
+
+ COPY_2x4
+
+ addi A0, A0, 4*SIZE
+ addi A1, A1, 4*SIZE
+
+ addi B4, B4, 8*SIZE
+
+SCOPYOT_L2x2_BEGIN:
+
+ andi. T1, N, 2
+ ble SCOPYOT_L2x1_BEGIN
+
+ mr BO, B2
+
+ COPY_2x2
+
+ addi A0, A0, 2*SIZE
+ addi A1, A1, 2*SIZE
+
+ addi B2, B2, 4*SIZE
+
+SCOPYOT_L2x1_BEGIN:
+
+ andi. T1, N, 1
+ ble SCOPYOT_L2_END
+
+ mr BO, B1
+
+ COPY_2x1
+
+ addi A0, A0, 1*SIZE
+ addi A1, A1, 1*SIZE
+
+ addi B1, B1, 2*SIZE
+
+SCOPYOT_L2_END:
+
+
+SCOPYOT_L1_BEGIN:
+
+ andi. T1, M, 1
+ ble L999
+
+ mr A0, A
+ add A, A0, LDA
+ mr B8, B
+ addi B, B, 8*SIZE
+
+ sradi. J, N, 3
+ ble SCOPYOT_L1x4_BEGIN
+
+ mr BO, B8
+
+SCOPYOT_L1x8_LOOP:
+
+ COPY_1x8
+
+ addi A0, A0, 8*SIZE
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt SCOPYOT_L1x8_LOOP
+
+SCOPYOT_L1x4_BEGIN:
+
+ andi. T1, N, 4
+ ble SCOPYOT_L1x2_BEGIN
+
+ mr BO, B4
+
+ COPY_1x4
+
+ addi A0, A0, 4*SIZE
+
+ addi B4, B4, 4*SIZE
+
+SCOPYOT_L1x2_BEGIN:
+
+ andi. T1, N, 2
+ ble SCOPYOT_L1x1_BEGIN
+
+ mr BO, B2
+
+ COPY_1x2
+
+ addi A0, A0, 2*SIZE
+
+ addi B2, B2, 2*SIZE
+
+SCOPYOT_L1x1_BEGIN:
+
+ andi. T1, N, 1
+ ble SCOPYOT_L1_END
+
+ mr BO, B1
+
+ COPY_1x1
+
+ addi A0, A0, 1*SIZE
+
+ addi B1, B1, 1*SIZE
+
+SCOPYOT_L1_END:
+
diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..1b71d5bb3
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,311 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
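+// Each COPY_NxM macro in this file copies an N-row by M-column block of A
+// (rows A0..A3 as needed) to the contiguous panel at BO: the vector forms
+// use lxvw4x/stxvw4x (four floats per vector), the scalar forms lxsspx/stxsspx.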
+.macro COPY_4x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+
+ lxvw4x vs34, o0, A1
+ lxvw4x vs35, o16, A1
+
+ lxvw4x vs36, o0, A2
+ lxvw4x vs37, o16, A2
+
+ lxvw4x vs38, o0, A3
+ lxvw4x vs39, o16, A3
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvw4x vs32, o0, A0
+
+ lxvw4x vs33, o0, A1
+
+ lxvw4x vs34, o0, A2
+
+ lxvw4x vs35, o0, A3
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+
+ lxsspx vs34, o0, A1
+ lxsspx vs35, o4, A1
+
+ lxsspx vs36, o0, A2
+ lxsspx vs37, o4, A2
+
+ lxsspx vs38, o0, A3
+ lxsspx vs39, o4, A3
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+ stxsspx vs35, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs36, o0, T1
+ stxsspx vs37, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs38, o0, T1
+ stxsspx vs39, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxsspx vs32, o0, A0
+
+ lxsspx vs33, o0, A1
+
+ lxsspx vs34, o0, A2
+
+ lxsspx vs35, o0, A3
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+
+ stxsspx vs35, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+
+ lxvw4x vs34, o0, A1
+ lxvw4x vs35, o16, A1
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvw4x vs32, o0, A0
+
+ lxvw4x vs33, o0, A1
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+ stxvw4x vs33, o16, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+
+ lxsspx vs34, o0, A1
+ lxsspx vs35, o4, A1
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+ stxsspx vs35, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxsspx vs32, o0, A0
+
+ lxsspx vs33, o0, A1
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+
+ stxsspx vs33, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvw4x vs32, o0, A0
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxsspx vs32, o0, A0
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+
+.endm
+
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
index 336b13b1f..02c94a88a 100644
--- a/kernel/power/zgemm_kernel_8x2_power8.S
+++ b/kernel/power/zgemm_kernel_8x2_power8.S
@@ -1,3 +1,38 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@@ -250,7 +285,7 @@
ble L999
slwi LDC, LDC, ZBASE_SHIFT
- li PRE, 384
+ li PRE, 512
li o8 , 8
li o16 , 16
li o24 , 24
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
index 96612da82..0cd784cc0 100644
--- a/kernel/power/zgemm_logic_8x2_power8.S
+++ b/kernel/power/zgemm_logic_8x2_power8.S
@@ -1,3 +1,39 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
srawi. J, N, 1
ble ZGEMM_L2_END
@@ -5,20 +41,37 @@ ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
- slwi T1, K, 1
+ srawi. T1, K, 2
+ ble ZGEMM_L2_COPYB1
-ZGEMM_L2_COPYB:
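+// Copy B into BBUFFER four K-iterations (eight complex values) at a time,
+// prefetching both source and destination; the K mod 4 remainder is
+// handled two values per step in ZGEMM_L2_COPYB_LOOP below.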
+ZGEMM_L2_COPYB8:
- lxvdsx vs4, o0, BO // b0_r
- lxvdsx vs5, o8, BO // b0_i
- addi BO, BO, 16
- stxvd2x vs4, o0, BBO
- stxvd2x vs5, o16, BBO
+ addi T2, PRE, 128
+ dcbt BO, PRE
+ dcbtst BBO, PRE
+ dcbtst BBO, T2
+ ZCOPYB_8x1
addic. T1, T1, -1
- addi BBO, BBO, 32
- bge ZGEMM_L2_COPYB
+ bgt ZGEMM_L2_COPYB8
+ZGEMM_L2_COPYB1:
+
+ andi. T1, K, 3
+ ble ZGEMM_L2_COPYB_END
+
+ZGEMM_L2_COPYB_LOOP:
+
+ ZCOPYB_1x1
+ ZCOPYB_1x1
+ addic. T1, T1, -1
+
+ bgt ZGEMM_L2_COPYB_LOOP
+
+ZGEMM_L2_COPYB_END:
mr CO, C
mr AO, A
@@ -493,6 +546,7 @@ ZGEMM_L1_BEGIN:
slwi T1, K, 0
ZGEMM_L1_COPYB:
+ dcbtst BBO, PRE
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
index a0fbb2e11..c43a115b2 100644
--- a/kernel/power/zgemm_macros_8x2_power8.S
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -1,3 +1,38 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
@@ -3055,3 +3090,80 @@
.endm
+
+
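+// Copy one complex value of B to the bounce buffer BBO, with the real and imaginary parts each splatted across a vector.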
+.macro ZCOPYB_1x1
+
+ lxvdsx vs4, o0, BO // b0_r
+ lxvdsx vs5, o8, BO // b0_i
+ addi BO, BO, 16
+ stxvd2x vs4, o0, BBO
+ stxvd2x vs5, o16, BBO
+ addi BBO, BBO, 32
+
+.endm
+
+
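+// Copy eight complex values of B to BBO in one pass: xxspltd expands each
+// value into one vector of duplicated real parts and one of duplicated
+// imaginary parts, the layout the ZGEMM kernels load from BBUFFER.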
+.macro ZCOPYB_8x1
+
+ lxvd2x vs32, o0, BO
+ lxvd2x vs33, o16, BO
+ lxvd2x vs34, o32, BO
+ lxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+ lxvd2x vs36, o0, BO
+ lxvd2x vs37, o16, BO
+ lxvd2x vs38, o32, BO
+ lxvd2x vs39, o48, BO
+ addi BO, BO, 64
+
+ xxspltd vs40, vs32, 0
+ xxspltd vs41, vs32, 1
+ xxspltd vs42, vs33, 0
+ xxspltd vs43, vs33, 1
+ xxspltd vs44, vs34, 0
+ xxspltd vs45, vs34, 1
+ xxspltd vs46, vs35, 0
+ xxspltd vs47, vs35, 1
+
+ xxspltd vs48, vs36, 0
+ xxspltd vs49, vs36, 1
+ xxspltd vs50, vs37, 0
+ xxspltd vs51, vs37, 1
+ xxspltd vs52, vs38, 0
+ xxspltd vs53, vs38, 1
+ xxspltd vs54, vs39, 0
+ xxspltd vs55, vs39, 1
+
+ stxvd2x vs40, o0, BBO
+ stxvd2x vs41, o16, BBO
+ stxvd2x vs42, o32, BBO
+ stxvd2x vs43, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs44, o0, BBO
+ stxvd2x vs45, o16, BBO
+ stxvd2x vs46, o32, BBO
+ stxvd2x vs47, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs48, o0, BBO
+ stxvd2x vs49, o16, BBO
+ stxvd2x vs50, o32, BBO
+ stxvd2x vs51, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs52, o0, BBO
+ stxvd2x vs53, o16, BBO
+ stxvd2x vs54, o32, BBO
+ stxvd2x vs55, o48, BBO
+ addi BBO, BBO, 64
+
+.endm
+
+
diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..1f3f35419
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_8_power8.S
@@ -0,0 +1,207 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define NOTUS1 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTUS2 r27
+#define M8 r30
+#define T1 r31
+
+#define o0 0
+
+#include "zgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, ZBASE_SHIFT
+ slwi M8, M, 3 + ZBASE_SHIFT
+
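+	// B4, B2 and B1 are set up to point at the 4-, 2- and 1-column
+	// remainder panels of B (same masking scheme as sgemm_tcopy_8_power8.S).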
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, ZBASE_SHIFT
+ slwi B2, B2, ZBASE_SHIFT
+ slwi B1, B1, ZBASE_SHIFT
+
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M8, 128
+
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "zgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/zgemm_tcopy_logic_8_power8.S b/kernel/power/zgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..34fd307bd
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,246 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. I, M, 2
+ ble ZCOPYT_L2_BEGIN
+
+
+ZCOPYT_L4_BEGIN:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+ mr B8, B
+ addi B, B, 64*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L4x4_BEGIN
+
+ mr BO, B8
+
+ .align 5
+
+ZCOPYT_L4x8_LOOP:
+
+ addi T1, PREB, 128
+ addi T2, PREB, 256
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ dcbtst BO, M8
+ dcbtst BO, PREB
+ dcbtst BO, T1
+ dcbtst BO, T2
+
+ COPY_4x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L4x8_LOOP
+
+ZCOPYT_L4x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L4x2_BEGIN
+
+ mr BO, B4
+
+ COPY_4x4
+
+
+ addi B4, B4, 32*SIZE
+
+ZCOPYT_L4x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L4x1_BEGIN
+
+ mr BO, B2
+
+ COPY_4x2
+
+
+ addi B2, B2, 16*SIZE
+
+ZCOPYT_L4x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L4_END
+
+ mr BO, B1
+
+ COPY_4x1
+
+
+ addi B1, B1, 8*SIZE
+
+ZCOPYT_L4_END:
+
+ addic. I, I, -1
+ bgt ZCOPYT_L4_BEGIN
+
+
+
+ZCOPYT_L2_BEGIN:
+
+ andi. T1, M, 2
+ ble ZCOPYT_L1_BEGIN
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+ mr B8, B
+ addi B, B, 32*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L2x4_BEGIN
+
+ mr BO, B8
+
+ZCOPYT_L2x8_LOOP:
+
+ COPY_2x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L2x8_LOOP
+
+ZCOPYT_L2x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L2x2_BEGIN
+
+ mr BO, B4
+
+ COPY_2x4
+
+
+ addi B4, B4, 16*SIZE
+
+ZCOPYT_L2x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L2x1_BEGIN
+
+ mr BO, B2
+
+ COPY_2x2
+
+
+ addi B2, B2, 8*SIZE
+
+ZCOPYT_L2x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L2_END
+
+ mr BO, B1
+
+ COPY_2x1
+
+
+ addi B1, B1, 4*SIZE
+
+ZCOPYT_L2_END:
+
+
+ZCOPYT_L1_BEGIN:
+
+ andi. T1, M, 1
+ ble L999
+
+ mr A0, A
+ add A, A0, LDA
+ mr B8, B
+ addi B, B, 16*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L1x4_BEGIN
+
+ mr BO, B8
+
+ZCOPYT_L1x8_LOOP:
+
+ COPY_1x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L1x8_LOOP
+
+ZCOPYT_L1x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L1x2_BEGIN
+
+ mr BO, B4
+
+ COPY_1x4
+
+
+ addi B4, B4, 8*SIZE
+
+ZCOPYT_L1x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L1x1_BEGIN
+
+ mr BO, B2
+
+ COPY_1x2
+
+
+ addi B2, B2, 4*SIZE
+
+ZCOPYT_L1x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L1_END
+
+ mr BO, B1
+
+ COPY_1x1
+
+
+ addi B1, B1, 2*SIZE
+
+ZCOPYT_L1_END:
+
diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..e8c2f0baa
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,535 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs40, o0, A1
+ lxvd2x vs41, o16, A1
+ lxvd2x vs42, o32, A1
+ lxvd2x vs43, o48, A1
+ addi A1, A1, 64
+
+ lxvd2x vs44, o0, A1
+ lxvd2x vs45, o16, A1
+ lxvd2x vs46, o32, A1
+ lxvd2x vs47, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs48, o0, A2
+ lxvd2x vs49, o16, A2
+ lxvd2x vs50, o32, A2
+ lxvd2x vs51, o48, A2
+ addi A2, A2, 64
+
+ lxvd2x vs52, o0, A2
+ lxvd2x vs53, o16, A2
+ lxvd2x vs54, o32, A2
+ lxvd2x vs55, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs56, o0, A3
+ lxvd2x vs57, o16, A3
+ lxvd2x vs58, o32, A3
+ lxvd2x vs59, o48, A3
+ addi A3, A3, 64
+
+ lxvd2x vs60, o0, A3
+ lxvd2x vs61, o16, A3
+ lxvd2x vs62, o32, A3
+ lxvd2x vs63, o48, A3
+ addi A3, A3, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs48, o0, T1
+ stxvd2x vs49, o16, T1
+ stxvd2x vs50, o32, T1
+ stxvd2x vs51, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs52, o0, T1
+ stxvd2x vs53, o16, T1
+ stxvd2x vs54, o32, T1
+ stxvd2x vs55, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs56, o0, T1
+ stxvd2x vs57, o16, T1
+ stxvd2x vs58, o32, T1
+ stxvd2x vs59, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs60, o0, T1
+ stxvd2x vs61, o16, T1
+ stxvd2x vs62, o32, T1
+ stxvd2x vs63, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs36, o0, A1
+ lxvd2x vs37, o16, A1
+ lxvd2x vs38, o32, A1
+ lxvd2x vs39, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs40, o0, A2
+ lxvd2x vs41, o16, A2
+ lxvd2x vs42, o32, A2
+ lxvd2x vs43, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs44, o0, A3
+ lxvd2x vs45, o16, A3
+ lxvd2x vs46, o32, A3
+ lxvd2x vs47, o48, A3
+ addi A3, A3, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs34, o0, A1
+ lxvd2x vs35, o16, A1
+ addi A1, A1, 32
+
+
+ lxvd2x vs36, o0, A2
+ lxvd2x vs37, o16, A2
+ addi A2, A2, 32
+
+
+ lxvd2x vs38, o0, A3
+ lxvd2x vs39, o16, A3
+ addi A3, A3, 32
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs33, o0, A1
+ addi A1, A1, 16
+
+
+ lxvd2x vs34, o0, A2
+ addi A2, A2, 16
+
+
+ lxvd2x vs35, o0, A3
+ addi A3, A3, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs40, o0, A1
+ lxvd2x vs41, o16, A1
+ lxvd2x vs42, o32, A1
+ lxvd2x vs43, o48, A1
+ addi A1, A1, 64
+
+ lxvd2x vs44, o0, A1
+ lxvd2x vs45, o16, A1
+ lxvd2x vs46, o32, A1
+ lxvd2x vs47, o48, A1
+ addi A1, A1, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs36, o0, A1
+ lxvd2x vs37, o16, A1
+ lxvd2x vs38, o32, A1
+ lxvd2x vs39, o48, A1
+ addi A1, A1, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs34, o0, A1
+ lxvd2x vs35, o16, A1
+ addi A1, A1, 32
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs33, o0, A1
+ addi A1, A1, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+ stxvd2x vs33, o16, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+.endm
+
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index a4d1486fc..ba44b8f61 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -933,6 +933,23 @@ static void init_parameter(void) {
#endif
#endif
+#ifdef EXCAVATOR
+
+#ifdef DEBUG
+ fprintf(stderr, "Excavator\n");
+#endif
+
+ TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+ TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+ TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+ TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+#ifdef EXPRECISION
+ TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
+ TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
+#endif
+#endif
+
+
#ifdef PILEDRIVER
#ifdef DEBUG
diff --git a/kernel/x86_64/KERNEL.EXCAVATOR b/kernel/x86_64/KERNEL.EXCAVATOR
index dbdd1fe9b..4ec748284 100644
--- a/kernel/x86_64/KERNEL.EXCAVATOR
+++ b/kernel/x86_64/KERNEL.EXCAVATOR
@@ -1,3 +1,7 @@
+DSCALKERNEL = dscal.c
+CSCALKERNEL = cscal.c
+ZSCALKERNEL = zscal.c
+
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
@@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S
@@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c
+STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c
+STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c
+STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c
+CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c
+CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c
+CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c
+ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c
+ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c
+ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c
+ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c
diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c
index 1ee0499a7..5af9b8fcc 100644
--- a/kernel/x86_64/caxpy.c
+++ b/kernel/x86_64/caxpy.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "caxpy_microk_steamroller-2.c"
#elif defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c"
diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c
index 2b2c4ff7a..9bba72ba2 100644
--- a/kernel/x86_64/cdot.c
+++ b/kernel/x86_64/cdot.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER)
#include "cdot_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER) || defined(PILEDRIVER)
+#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "cdot_microk_steamroller-2.c"
#elif defined(HASWELL)
#include "cdot_microk_haswell-2.c"
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index d60e4475d..235510534 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL)
#include "cgemv_n_microk_haswell-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_n_microk_bulldozer-4.c"
#endif
diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c
index b558164ff..1a714f61f 100644
--- a/kernel/x86_64/cgemv_t_4.c
+++ b/kernel/x86_64/cgemv_t_4.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL)
#include "cgemv_t_microk_haswell-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_t_microk_bulldozer-4.c"
#endif
diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c
index 5d86b1929..c44d12e3d 100644
--- a/kernel/x86_64/cscal.c
+++ b/kernel/x86_64/cscal.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "cscal_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER)
+#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cscal_microk_steamroller-2.c"
#elif defined(SANDYBRIDGE)
#include "cscal_microk_bulldozer-2.c"
diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
index 56d323cbe..18569e6e4 100644
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER)
#include "daxpy_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER)
+#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "daxpy_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "daxpy_microk_piledriver-2.c"
diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
index 4bf8082c9..a45dd7f3b 100644
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER)
#include "ddot_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER)
+#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ddot_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "ddot_microk_piledriver-2.c"
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
index 485b234b0..4200b8acd 100644
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
-#elif defined(HASWELL) || defined(STEAMROLLER)
+#elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_n_microk_haswell-4.c"
#endif
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
index 8ed821dd0..42f11f39a 100644
--- a/kernel/x86_64/dgemv_t_4.c
+++ b/kernel/x86_64/dgemv_t_4.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(HASWELL) || defined(STEAMROLLER)
+#if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_t_microk_haswell-4.c"
#endif
diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c
index b7110e6ac..bbc1c9660 100644
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dscal_microk_bulldozer-2.c"
#elif defined(SANDYBRIDGE)
#include "dscal_microk_sandy-2.c"
diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c
index 3f5e77e5f..e10784ad7 100644
--- a/kernel/x86_64/dsymv_L.c
+++ b/kernel/x86_64/dsymv_L.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "dsymv_L_microk_haswell-2.c"
diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c
index 9f5ae3015..bd07ce2c3 100644
--- a/kernel/x86_64/dsymv_U.c
+++ b/kernel/x86_64/dsymv_U.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_U_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "dsymv_U_microk_haswell-2.c"
diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c
index 0b76c42f7..b9e5d5784 100644
--- a/kernel/x86_64/saxpy.c
+++ b/kernel/x86_64/saxpy.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "saxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c"
-#elif defined(PILEDRIVER) || defined(STEAMROLLER)
+#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "saxpy_microk_piledriver-2.c"
#endif
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c
index a3d20d276..d9fc417a0 100644
--- a/kernel/x86_64/sdot.c
+++ b/kernel/x86_64/sdot.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER)
#include "sdot_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER) || defined(PILEDRIVER)
+#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "sdot_microk_steamroller-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index c7b4516c3..bdf68dd07 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_n_microk_haswell-4.c"
#endif
-#if defined(STEAMROLLER)
+#if defined(STEAMROLLER) || defined(EXCAVATOR)
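+/* these cores use the smaller buffer block below */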
#define NBMAX 2048
#else
#define NBMAX 4096
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 5c7d1a53b..62550e65c 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_t_microk_haswell-4.c"
#endif
-#if defined(STEAMROLLER)
+#if defined(STEAMROLLER) || defined(EXCAVATOR)
#define NBMAX 2048
#else
#define NBMAX 4096
diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
index 0997f108d..3813981ed 100644
--- a/kernel/x86_64/ssymv_L.c
+++ b/kernel/x86_64/ssymv_L.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ssymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ssymv_L_microk_nehalem-2.c"
diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c
index ed1e8236c..e4d3c9b30 100644
--- a/kernel/x86_64/ssymv_U.c
+++ b/kernel/x86_64/ssymv_U.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ssymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ssymv_U_microk_nehalem-2.c"
diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c
index 560acc7f9..0cd555a68 100644
--- a/kernel/x86_64/zaxpy.c
+++ b/kernel/x86_64/zaxpy.c
@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER)
#include "zaxpy_microk_bulldozer-2.c"
-#elif defined(PILEDRIVER) || defined(STEAMROLLER)
+#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zaxpy_microk_steamroller-2.c"
#elif defined(HASWELL)
#include "zaxpy_microk_haswell-2.c"
diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index eee00fd9f..4533d4e88 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(BULLDOZER)
#include "zdot_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER) || defined(PILEDRIVER)
+#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "zdot_microk_steamroller-2.c"
#elif defined(HASWELL)
#include "zdot_microk_haswell-2.c"
diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c
index 63e49f2af..4171fc99f 100644
--- a/kernel/x86_64/zgemv_n_4.c
+++ b/kernel/x86_64/zgemv_n_4.c
@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemv_n_microk_haswell-4.c"
#elif defined(SANDYBRIDGE)
#include "zgemv_n_microk_sandy-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_n_microk_bulldozer-4.c"
#endif
diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c
index 4abb2d5ad..0524c71f7 100644
--- a/kernel/x86_64/zgemv_t_4.c
+++ b/kernel/x86_64/zgemv_t_4.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_t_microk_bulldozer-4.c"
#elif defined(HASWELL)
#include "zgemv_t_microk_haswell-4.c"
diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c
index a96766032..7ca8774b7 100644
--- a/kernel/x86_64/zscal.c
+++ b/kernel/x86_64/zscal.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "zscal_microk_bulldozer-2.c"
-#elif defined(STEAMROLLER)
+#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zscal_microk_steamroller-2.c"
#endif
diff --git a/param.h b/param.h
index 6948e6a76..abe739af2 100644
--- a/param.h
+++ b/param.h
@@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 960
-#define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 720
-#define ZGEMM_DEFAULT_P 480
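+/* P and Q are the GEMM cache-blocking panel dimensions for each precision */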
+#define SGEMM_DEFAULT_P 1280
+#define DGEMM_DEFAULT_P 640
+#define CGEMM_DEFAULT_P 640
+#define ZGEMM_DEFAULT_P 320
-#define SGEMM_DEFAULT_Q 720
-#define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 720
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 640
+#define CGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 640
#define SYMV_P 8
diff --git a/test/Makefile b/test/Makefile
index 75ea6de60..65fb6f438 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,6 +4,7 @@ include ../Makefile.system
all :: level1 level2 level3
level1 : sblat1 dblat1 cblat1 zblat1
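+# when cross-compiling, only build the tests: target binaries cannot run on the build host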
+ifndef CROSS
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
@@ -21,8 +22,10 @@ else
OPENBLAS_NUM_THREADS=2 ./zblat1
endif
endif
+endif
level2 : sblat2 dblat2 cblat2 zblat2
+ifndef CROSS
rm -f ?BLAT2.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
@@ -54,8 +57,10 @@ else
@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
endif
endif
+endif
level3 : sblat3 dblat3 cblat3 zblat3
+ifndef CROSS
rm -f ?BLAT3.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
@@ -87,9 +92,11 @@ else
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
endif
endif
+endif
level3_3m : zblat3_3m cblat3_3m
+ifndef CROSS
rm -f ?BLAT3_3M.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -109,6 +116,7 @@ else
@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
endif
endif
+endif
diff --git a/utest/Makefile b/utest/Makefile
index 9f9808920..3ccc0a041 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
run_test: $(UTESTBIN)
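+# skip running the unit tests when cross-compiling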
+ifndef CROSS
./$(UTESTBIN)
+endif
clean:
-rm -f *.o $(UTESTBIN)