diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4431103bd..999413be2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -151,5 +151,9 @@ In chronological order: * [2016-03-20] Fix compiler error in VisualStudio with CMake * [2016-03-22] Fix access violation on Windows while static linking +* Paul Mustière + * [2016-02-04] Fix Android build on ARMV7 + * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8 + * Shivraj Patil * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA diff --git a/Makefile b/Makefile index 9ba2bffb3..2ae004798 100644 --- a/Makefile +++ b/Makefile @@ -108,8 +108,6 @@ endif tests : ifndef NOFORTRAN -ifndef TARGET -ifndef CROSS touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -119,8 +117,6 @@ ifndef NO_CBLAS $(MAKE) -C ctest all endif endif -endif -endif libs : ifeq ($(CORE), UNKOWN) diff --git a/Makefile.install b/Makefile.install index 5da4e68c9..1b9388a8b 100644 --- a/Makefile.install +++ b/Makefile.install @@ -20,75 +20,75 @@ lib.grd : $(error OpenBLAS: Please run "make" firstly) install : lib.grd - @-mkdir -p $(DESTDIR)$(PREFIX) - @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) - @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @-mkdir -p "$(DESTDIR)$(PREFIX)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) #for inc - @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" + @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h - @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h + @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @cat common_interface.h >> 
"$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" + @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h + @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) - @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) - @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) - @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) - @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) - @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ + @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" + @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) - @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) - @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) + @-cp $(LIBDLLNAME) 
"$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" endif ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) @@ -96,34 +96,34 @@ endif endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), Darwin) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif else #only static - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) - @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> 
"$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" + @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! diff --git a/README.md b/README.md index 32a861081..8ac88840a 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt - **MingWin or Visual Studio(CMake)/Windows**: Please read . - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **FreeBSD**: Supported by community. We didn't test the library on this OS. +- **Android**: Supported by community. Please read . ## Usages Link with libopenblas.a or -lopenblas for shared library. diff --git a/c_check b/c_check index d624472dc..50ff360a2 100644 --- a/c_check +++ b/c_check @@ -1,5 +1,7 @@ #!/usr/bin/perl +use File::Basename; + # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); @@ -26,14 +28,12 @@ if ($?) { $cross_suffix = ""; -if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { - if ($1 =~ /(.*-)(.*)/) { - $cross_suffix = $1; - } -} else { - if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { - $cross_suffix = $1; - } +if (dirname($compiler_name) ne ".") { + $cross_suffix .= dirname($compiler_name) . 
"/"; +} + +if (basename($compiler_name) =~ /(.*-)(.*)/) { + $cross_suffix .= $1; } $compiler = ""; @@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64; print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; -print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; +print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; diff --git a/ctest/Makefile b/ctest/Makefile index 7a5d236aa..6eda43863 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat1 OMP_NUM_THREADS=2 ./xdcblat1 @@ -53,8 +54,10 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat1 OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif +endif all2: xscblat2 xdcblat2 xccblat2 xzcblat2 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 OMP_NUM_THREADS=2 ./xdcblat2 < din2 @@ -66,8 +69,10 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif +endif all3: xscblat3 xdcblat3 xccblat3 xzcblat3 +ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 OMP_NUM_THREADS=2 ./xdcblat3 < din3 @@ -88,6 +93,7 @@ else OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif +endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2fde07fcc..9e8cce438 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 21; i++) + for ( i=1 ; i <= 22; i++) { if (!strncasecmp(coretype,corename[i],20)) { diff --git a/driver/others/init.c b/driver/others/init.c index f134f85f7..801f93991 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -361,6 +361,9 @@ static void numa_mapping(void) { unsigned long work, bit; int count = 0; int bitmask_idx = 0; + int current_cpu; + int current_node = 0; + int cpu_count = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; @@ -382,33 +385,84 @@ static void numa_mapping(void) { fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); #endif - h = 1; + current_cpu = sched_getcpu(); + for (cpu = 0; cpu < count; cpu++) { + if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { + current_node = READ_NODE(common -> cpu_info[cpu]); + break; + } + } + for (i = 0; i < MAX_BITMASK_LEN; i++) + cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); - while (h < count) h = 2 * h + 1; + /* + * If all the processes can be accommodated in the + * in the current node itself, then bind to cores + * from the current node only + */ + if (numprocs <= cpu_count) { + /* + * First sort all the cores in order from the current node. + * Then take remaining nodes one by one in order, + * and sort their cores in order. 
+ */ + for (i = 0; i < count; i++) { + for (j = 0; j < count - 1; j++) { + int node_1, node_2; + int core_1, core_2; + int swap = 0; - while (h > 1) { - h /= 2; - for (i = h; i < count; i++) { - work = common -> cpu_info[i]; - bit = CPU_ISSET(i, &cpu_orig_mask[0]); - j = i - h; - while (work < common -> cpu_info[j]) { - common -> cpu_info[j + h] = common -> cpu_info[j]; - if (CPU_ISSET(j, &cpu_orig_mask[0])) { - CPU_SET(j + h, &cpu_orig_mask[0]); - } else { - CPU_CLR(j + h, &cpu_orig_mask[0]); - } - j -= h; - if (j < 0) break; - } - common -> cpu_info[j + h] = work; - if (bit) { - CPU_SET(j + h, &cpu_orig_mask[0]); - } else { - CPU_CLR(j + h, &cpu_orig_mask[0]); + node_1 = READ_NODE(common -> cpu_info[j]); + node_2 = READ_NODE(common -> cpu_info[j + 1]); + core_1 = READ_CORE(common -> cpu_info[j]); + core_2 = READ_CORE(common -> cpu_info[j + 1]); + + if (node_1 == node_2) { + if (core_1 > core_2) + swap = 1; + } else { + if ((node_2 == current_node) || + ((node_1 != current_node) && (node_1 > node_2))) + swap = 1; + } + if (swap) { + unsigned long temp; + + temp = common->cpu_info[j]; + common->cpu_info[j] = common->cpu_info[j + 1]; + common->cpu_info[j + 1] = temp; + } } + } + } else { + h = 1; + while (h < count) h = 2 * h + 1; + + while (h > 1) { + h /= 2; + for (i = h; i < count; i++) { + work = common -> cpu_info[i]; + bit = CPU_ISSET(i, &cpu_orig_mask[0]); + j = i - h; + while (work < common -> cpu_info[j]) { + common -> cpu_info[j + h] = common -> cpu_info[j]; + if (CPU_ISSET(j, &cpu_orig_mask[0])) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + j -= h; + if (j < 0) break; + } + common -> cpu_info[j + h] = work; + if (bit) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + + } } } @@ -416,7 +470,10 @@ static void numa_mapping(void) { fprintf(stderr, "\nSorting ...\n\n"); for (cpu = 0; cpu < count; cpu++) - fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); + fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], + READ_CPU(common -> cpu_info[cpu]), + READ_CORE(common -> cpu_info[cpu]), + READ_NODE(common -> cpu_info[cpu])); #endif } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index f4b1a80ad..f22c6b69a 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) int size = 16; #else int size = get_L2_size(); diff --git a/exports/Makefile b/exports/Makefile index c2b8d9c1c..5632b6fff 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def 
endif ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) #only build without Fortran - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index e1b89cc97..8e3d084aa 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -21,16 +21,16 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = gemm_ncopy_4.S -DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S new file mode 100644 index 000000000..b1a7d2b27 --- /dev/null +++ b/kernel/power/cgemm_tcopy_8_power8.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTUS2 r27 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "cgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, ZBASE_SHIFT + slwi M8, M, 3 + ZBASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, ZBASE_SHIFT + slwi B2, B2, ZBASE_SHIFT + slwi B1, B1, ZBASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "cgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/cgemm_tcopy_logic_8_power8.S b/kernel/power/cgemm_tcopy_logic_8_power8.S new file mode 100644 index 000000000..9418908b7 --- /dev/null +++ b/kernel/power/cgemm_tcopy_logic_8_power8.S @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble CCOPYT_L2_BEGIN + + +CCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 64*SIZE + + sradi. J, N, 3 + ble CCOPYT_L4x4_BEGIN + + mr BO, B8 + +CCOPYT_L4x8_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M8 + dcbtst BO, PREB + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + ble CCOPYT_L4x4_BEGIN + + + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L4x8_LOOP + +CCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 32*SIZE + +CCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 16*SIZE + +CCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 8*SIZE + +CCOPYT_L4_END: + + addic. I, I, -1 + bgt CCOPYT_L4_BEGIN + + + +CCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble CCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble CCOPYT_L2x4_BEGIN + + mr BO, B8 + +CCOPYT_L2x8_LOOP: + + COPY_2x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L2x8_LOOP + +CCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 16*SIZE + +CCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 8*SIZE + +CCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 4*SIZE + +CCOPYT_L2_END: + + +CCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble CCOPYT_L1x4_BEGIN + + mr BO, B8 + +CCOPYT_L1x8_LOOP: + + COPY_1x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt CCOPYT_L1x8_LOOP + +CCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble CCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 8*SIZE + +CCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble CCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 4*SIZE + +CCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble CCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 2*SIZE + +CCOPYT_L1_END: + diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S new file mode 100644 index 000000000..03fda2766 --- /dev/null +++ b/kernel/power/cgemm_tcopy_macros_8_power8.S @@ -0,0 +1,385 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + + addi A0, A0, 64 + addi A1, A1, 64 + + lxvw4x vs40, o0, A2 + lxvw4x vs41, o16, A2 + lxvw4x vs42, o32, A2 + lxvw4x vs43, o48, A2 + + lxvw4x vs44, o0, A3 + lxvw4x vs45, o16, A3 + lxvw4x vs46, o32, A3 + lxvw4x vs47, o48, A3 + + mr T1, BO + addi A2, A2, 64 + addi A3, A3, 64 + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs40, o0, T1 + stxvw4x vs41, o16, T1 + stxvw4x vs42, o32, T1 + stxvw4x vs43, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs44, o0, T1 + stxvw4x vs45, o16, T1 + stxvw4x vs46, o32, T1 + stxvw4x vs47, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + addi A1, A1, 32 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + addi A2, A2, 32 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + addi A3, A3, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + 
stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + lxvw4x vs33, o0, A1 + addi A1, A1, 16 + + lxvw4x vs34, o0, A2 + addi A2, A2, 16 + + lxvw4x vs35, o0, A3 + addi A3, A3, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + addi A1, A1, 8 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + addi A2, A2, 8 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + addi A3, A3, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + addi A0, A0, 64 + + lxvw4x vs36, o0, A1 + lxvw4x vs37, o16, A1 + lxvw4x vs38, o32, A1 + lxvw4x vs39, o48, A1 + addi A1, A1, 64 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + addi A1, A1, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + lxvw4x vs33, o0, A1 + addi A1, A1, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + addi A1, A1, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, 
o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + lxvw4x vs34, o32, A0 + lxvw4x vs35, o48, A0 + addi A0, A0, 64 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + addi A0, A0, 32 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvw4x vs32, o0, A0 + addi A0, A0, 16 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + addi A0, A0, 8 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 4c14b0c6f..8af7fe389 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#define T4 r12 +#define T3 r11 + +#define o40 r12 +#define o56 r11 + +#define o112 r14 #define o8 r15 #define o24 r16 -#define ALPHA r17 +#define o64 r17 #define L r18 #define T1 r19 -#define KK r20 -#define BB r21 +#define o80 r20 +#define o96 r21 #define I r22 #define J r23 #define AO r24 @@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) + stw r14, 212(SP) #endif stfd f1, ALPHA_SP @@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .L999_H1 #ifdef __64BIT__ - addi ALPHA, SP, 296 + addi T1, SP, 296 #else - addi ALPHA, SP, 224 + addi T1, SP, 224 #endif - li PRE, 256 + li PRE, 384 li o8 , 8 li o16, 16 li o24, 24 li o32, 32 li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 - lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_r, 0, T1 #include "dgemm_logic_16x4_power8.S" @@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) + lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 49c438f61..718f80bdd 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -35,193 +35,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. srawi. J, N, 2 - ble .LDGEMM_L4_END + ble LDGEMM_L4_END -.LDGEMM_L4_BEGIN: +LDGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L4x16_END + ble LDGEMM_L4x16_END -.LDGEMM_L4x16_BEGIN: + .align 4 +LDGEMM_L4x16_BEGIN: + li L, -128 + + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 mr BO, B - srawi. L, K, 3 - ble .LDGEMM_L4x16_SUB0 + srawi. L, K, 1 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + ble LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x16_SUB4 + ble LDGEMM_L4x16_SUB4 -.LDGEMM_L4x16_LOOP_START: + .align 4 +LDGEMM_L4x16_LOOP_START: - dcbt AO, PRE + li o40, 40 + li o56, 56 + + dcbt AO, PRE LOAD4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_I1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - + dcbt AO, PRE addic. L, L, -2 - ble .LDGEMM_L4x16_LOOP_END + KERNEL4x16_L2 - .align 5 + ble LDGEMM_L4x16_LOOP_END -.LDGEMM_L4x16_LOOP: + .align 4 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 +LDGEMM_L4x16_LOOP: - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE addic. L, L, -1 - bgt .LDGEMM_L4x16_LOOP + KERNEL4x16_L2 -.LDGEMM_L4x16_LOOP_END: + bgt LDGEMM_L4x16_LOOP - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + .align 4 + +LDGEMM_L4x16_LOOP_END: - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE KERNEL4x16_1 KERNEL4x16_E2 - b .LDGEMM_L4x16_SUB1 + b LDGEMM_L4x16_SUB1 -.LDGEMM_L4x16_SUB4: +LDGEMM_L4x16_SUB4: - dcbt AO, PRE KERNEL4x16_SUBI1 - dcbt AO, PRE - KERNEL4x16_SUB1 - dcbt AO, PRE - KERNEL4x16_SUB1 - dcbt AO, PRE KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 + b LDGEMM_L4x16_SUB1 - b .LDGEMM_L4x16_SUB1 +LDGEMM_L4x16_SUB0: -.LDGEMM_L4x16_SUB0: - - andi. L, K, 7 + andi. L, K, 1 KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x16_SAVE - b .LDGEMM_L4x16_SUB2 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SUB1: +LDGEMM_L4x16_SUB1: - andi. L, K, 7 - ble .LDGEMM_L4x16_SAVE + andi. L, K, 1 + ble LDGEMM_L4x16_SAVE -.LDGEMM_L4x16_SUB2: +LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x16_SUB2 + bgt LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SAVE: + .align 4 +LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt .LDGEMM_L4x16_BEGIN + bgt LDGEMM_L4x16_BEGIN -.LDGEMM_L4x16_END: +LDGEMM_L4x16_END: -.LDGEMM_L4x8_BEGIN: +LDGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END andi. T1, M, 8 - ble .LDGEMM_L4x8_END + ble LDGEMM_L4x8_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L4x8_SUB0 + ble LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x8_SUB4 + ble LDGEMM_L4x8_SUB4 -.LDGEMM_L4x8_LOOP_START: +LDGEMM_L4x8_LOOP_START: + dcbt AO, PRE LOAD4x8_1 KERNEL4x8_I1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 - ble .LDGEMM_L4x8_LOOP_END + ble LDGEMM_L4x8_LOOP_END .align 5 -.LDGEMM_L4x8_LOOP: +LDGEMM_L4x8_LOOP: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 - bgt .LDGEMM_L4x8_LOOP + bgt LDGEMM_L4x8_LOOP -.LDGEMM_L4x8_LOOP_END: +LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -233,9 +227,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB4: +LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -247,81 +241,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB0: +LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x8_SAVE - b .LDGEMM_L4x8_SUB2 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SUB1: +LDGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x8_SAVE + ble LDGEMM_L4x8_SAVE -.LDGEMM_L4x8_SUB2: +LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x8_SUB2 + bgt LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SAVE: +LDGEMM_L4x8_SAVE: SAVE4x8 -.LDGEMM_L4x8_END: +LDGEMM_L4x8_END: -.LDGEMM_L4x4_BEGIN: +LDGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L4x4_END + ble LDGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x4_SUB0 + ble LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x4_SUB4 + ble LDGEMM_L4x4_SUB4 -.LDGEMM_L4x4_LOOP_START: +LDGEMM_L4x4_LOOP_START: + dcbt AO, PRE LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -2 - ble .LDGEMM_L4x4_LOOP_END + ble LDGEMM_L4x4_LOOP_END .align 5 -.LDGEMM_L4x4_LOOP: +LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -1 - bgt .LDGEMM_L4x4_LOOP + bgt LDGEMM_L4x4_LOOP -.LDGEMM_L4x4_LOOP_END: +LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -333,9 +332,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB4: +LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -347,48 +346,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB0: +LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x4_SAVE - b .LDGEMM_L4x4_SUB2 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SUB1: +LDGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x4_SAVE + ble LDGEMM_L4x4_SAVE -.LDGEMM_L4x4_SUB2: +LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x4_SUB2 + bgt LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SAVE: +LDGEMM_L4x4_SAVE: SAVE4x4 -.LDGEMM_L4x4_END: +LDGEMM_L4x4_END: -.LDGEMM_L4x2_BEGIN: +LDGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L4x2_END + ble LDGEMM_L4x2_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L4x2_SUB0 + ble LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x2_SUB4 + ble LDGEMM_L4x2_SUB4 -.LDGEMM_L4x2_LOOP_START: +LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -402,11 +401,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LDGEMM_L4x2_LOOP_END + ble LDGEMM_L4x2_LOOP_END .align 5 -.LDGEMM_L4x2_LOOP: +LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -419,9 +418,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LDGEMM_L4x2_LOOP + bgt LDGEMM_L4x2_LOOP -.LDGEMM_L4x2_LOOP_END: +LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -433,9 +432,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB4: +LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -447,48 +446,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB0: +LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x2_SAVE - b .LDGEMM_L4x2_SUB2 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SUB1: +LDGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x2_SAVE + ble LDGEMM_L4x2_SAVE -.LDGEMM_L4x2_SUB2: +LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x2_SUB2 + bgt LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SAVE: +LDGEMM_L4x2_SAVE: SAVE4x2 -.LDGEMM_L4x2_END: +LDGEMM_L4x2_END: -.LDGEMM_L4x1_BEGIN: +LDGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x1_SUB0 + ble LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x1_SUB4 + ble LDGEMM_L4x1_SUB4 -.LDGEMM_L4x1_LOOP_START: +LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -502,11 +501,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LDGEMM_L4x1_LOOP_END + ble LDGEMM_L4x1_LOOP_END .align 5 -.LDGEMM_L4x1_LOOP: +LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -519,9 +518,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LDGEMM_L4x1_LOOP + bgt LDGEMM_L4x1_LOOP -.LDGEMM_L4x1_LOOP_END: +LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -533,9 +532,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB4: +LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -547,74 +546,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB0: +LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x1_SAVE - b .LDGEMM_L4x1_SUB2 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SUB1: +LDGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x1_SAVE + ble LDGEMM_L4x1_SAVE -.LDGEMM_L4x1_SUB2: +LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x1_SUB2 + bgt LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SAVE: +LDGEMM_L4x1_SAVE: SAVE4x1 -.LDGEMM_L4x1_END: +LDGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LDGEMM_L4_BEGIN + bgt LDGEMM_L4_BEGIN andi. T2, N, 3 ble .L999 -.LDGEMM_L4_END: +LDGEMM_L4_END: - b .LDGEMM_L2_BEGIN + b LDGEMM_L2_BEGIN .L999_H1: b .L999 -.LDGEMM_L2_BEGIN: +LDGEMM_L2_BEGIN: andi. 
T1, N, 2 - ble .LDGEMM_L2_END + ble LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L2x16_END + ble LDGEMM_L2x16_END -.LDGEMM_L2x16_BEGIN: +LDGEMM_L2x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x16_SUB0 + ble LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x16_SUB4 + ble LDGEMM_L2x16_SUB4 -.LDGEMM_L2x16_LOOP_START: +LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -637,11 +636,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LDGEMM_L2x16_LOOP_END + ble LDGEMM_L2x16_LOOP_END .align 5 -.LDGEMM_L2x16_LOOP: +LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -662,9 +661,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -1 - bgt .LDGEMM_L2x16_LOOP + bgt LDGEMM_L2x16_LOOP -.LDGEMM_L2x16_LOOP_END: +LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -683,9 +682,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB4: +LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -701,86 +700,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB0: +LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x16_SAVE - b .LDGEMM_L2x16_SUB2 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SUB1: +LDGEMM_L2x16_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x16_SAVE + ble LDGEMM_L2x16_SAVE -.LDGEMM_L2x16_SUB2: +LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x16_SUB2 + bgt LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SAVE: +LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt .LDGEMM_L2x16_BEGIN + bgt LDGEMM_L2x16_BEGIN -.LDGEMM_L2x16_END: +LDGEMM_L2x16_END: -.LDGEMM_L2x8_BEGIN: +LDGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END andi. T1, M, 8 - ble .LDGEMM_L2x8_END + ble LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x8_SUB0 + ble LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x8_SUB4 + ble LDGEMM_L2x8_SUB4 -.LDGEMM_L2x8_LOOP_START: +LDGEMM_L2x8_LOOP_START: + dcbt AO, PRE LOAD2x8_1 KERNEL2x8_I1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -2 - ble .LDGEMM_L2x8_LOOP_END + ble LDGEMM_L2x8_LOOP_END .align 5 -.LDGEMM_L2x8_LOOP: +LDGEMM_L2x8_LOOP: KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 - bgt .LDGEMM_L2x8_LOOP + bgt LDGEMM_L2x8_LOOP -.LDGEMM_L2x8_LOOP_END: +LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -792,9 +800,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB4: +LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -806,48 +814,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB0: +LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x8_SAVE - b .LDGEMM_L2x8_SUB2 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SUB1: +LDGEMM_L2x8_SUB1: andi. 
L, K, 7 - ble .LDGEMM_L2x8_SAVE + ble LDGEMM_L2x8_SAVE -.LDGEMM_L2x8_SUB2: +LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x8_SUB2 + bgt LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SAVE: +LDGEMM_L2x8_SAVE: SAVE2x8 -.LDGEMM_L2x8_END: +LDGEMM_L2x8_END: -.LDGEMM_L2x4_BEGIN: +LDGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L2x4_END + ble LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x4_SUB0 + ble LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x4_SUB4 + ble LDGEMM_L2x4_SUB4 -.LDGEMM_L2x4_LOOP_START: +LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -861,11 +869,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LDGEMM_L2x4_LOOP_END + ble LDGEMM_L2x4_LOOP_END .align 5 -.LDGEMM_L2x4_LOOP: +LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -878,9 +886,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LDGEMM_L2x4_LOOP + bgt LDGEMM_L2x4_LOOP -.LDGEMM_L2x4_LOOP_END: +LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -892,9 +900,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB4: +LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -906,48 +914,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB0: +LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x4_SAVE - b .LDGEMM_L2x4_SUB2 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SUB1: +LDGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x4_SAVE + ble LDGEMM_L2x4_SAVE -.LDGEMM_L2x4_SUB2: +LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x4_SUB2 + bgt LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SAVE: +LDGEMM_L2x4_SAVE: SAVE2x4 -.LDGEMM_L2x4_END: +LDGEMM_L2x4_END: -.LDGEMM_L2x2_BEGIN: +LDGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L2x2_END + ble LDGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x2_SUB0 + ble LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x2_SUB4 + ble LDGEMM_L2x2_SUB4 -.LDGEMM_L2x2_LOOP_START: +LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -961,11 +969,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LDGEMM_L2x2_LOOP_END + ble LDGEMM_L2x2_LOOP_END .align 5 -.LDGEMM_L2x2_LOOP: +LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -978,9 +986,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LDGEMM_L2x2_LOOP + bgt LDGEMM_L2x2_LOOP -.LDGEMM_L2x2_LOOP_END: +LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -992,9 +1000,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB4: +LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1006,48 +1014,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB0: +LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x2_SAVE - b .LDGEMM_L2x2_SUB2 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SUB1: +LDGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x2_SAVE + ble LDGEMM_L2x2_SAVE -.LDGEMM_L2x2_SUB2: +LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. 
L, L, -1 - bgt .LDGEMM_L2x2_SUB2 + bgt LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SAVE: +LDGEMM_L2x2_SAVE: SAVE2x2 -.LDGEMM_L2x2_END: +LDGEMM_L2x2_END: -.LDGEMM_L2x1_BEGIN: +LDGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x1_SUB0 + ble LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x1_SUB4 + ble LDGEMM_L2x1_SUB4 -.LDGEMM_L2x1_LOOP_START: +LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1061,11 +1069,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LDGEMM_L2x1_LOOP_END + ble LDGEMM_L2x1_LOOP_END .align 5 -.LDGEMM_L2x1_LOOP: +LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1078,9 +1086,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LDGEMM_L2x1_LOOP + bgt LDGEMM_L2x1_LOOP -.LDGEMM_L2x1_LOOP_END: +LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1092,9 +1100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB4: +LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1106,59 +1114,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB0: +LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x1_SAVE - b .LDGEMM_L2x1_SUB2 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SUB1: +LDGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x1_SAVE + ble LDGEMM_L2x1_SAVE -.LDGEMM_L2x1_SUB2: +LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x1_SUB2 + bgt LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SAVE: +LDGEMM_L2x1_SAVE: SAVE2x1 -.LDGEMM_L2x1_END: +LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -.LDGEMM_L2_END: -.LDGEMM_L1_BEGIN: +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LDGEMM_L1_END + ble LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble .LDGEMM_L1x16_END + ble LDGEMM_L1x16_END -.LDGEMM_L1x16_BEGIN: +LDGEMM_L1x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x16_SUB0 + ble LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x16_SUB4 + ble LDGEMM_L1x16_SUB4 -.LDGEMM_L1x16_LOOP_START: +LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1181,11 +1189,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LDGEMM_L1x16_LOOP_END + ble LDGEMM_L1x16_LOOP_END .align 5 -.LDGEMM_L1x16_LOOP: +LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1206,9 +1214,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LDGEMM_L1x16_LOOP + bgt LDGEMM_L1x16_LOOP -.LDGEMM_L1x16_LOOP_END: +LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1227,9 +1235,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB4: +LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1245,86 +1253,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB0: +LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x16_SAVE - b .LDGEMM_L1x16_SUB2 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SUB1: +LDGEMM_L1x16_SUB1: andi. 
L, K, 7 - ble .LDGEMM_L1x16_SAVE + ble LDGEMM_L1x16_SAVE -.LDGEMM_L1x16_SUB2: +LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x16_SUB2 + bgt LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SAVE: +LDGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt .LDGEMM_L1x16_BEGIN + bgt LDGEMM_L1x16_BEGIN -.LDGEMM_L1x16_END: +LDGEMM_L1x16_END: -.LDGEMM_L1x8_BEGIN: +LDGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END andi. T1, M, 8 - ble .LDGEMM_L1x8_END + ble LDGEMM_L1x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x8_SUB0 + ble LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x8_SUB4 + ble LDGEMM_L1x8_SUB4 -.LDGEMM_L1x8_LOOP_START: +LDGEMM_L1x8_LOOP_START: + dcbt AO, PRE LOAD1x8_1 KERNEL1x8_I1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 - ble .LDGEMM_L1x8_LOOP_END + ble LDGEMM_L1x8_LOOP_END .align 5 -.LDGEMM_L1x8_LOOP: +LDGEMM_L1x8_LOOP: KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 - bgt .LDGEMM_L1x8_LOOP + bgt LDGEMM_L1x8_LOOP -.LDGEMM_L1x8_LOOP_END: +LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1336,9 +1353,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB4: +LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1350,48 +1367,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB0: +LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x8_SAVE - b .LDGEMM_L1x8_SUB2 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SUB1: +LDGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x8_SAVE + ble LDGEMM_L1x8_SAVE -.LDGEMM_L1x8_SUB2: +LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x8_SUB2 + bgt LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SAVE: +LDGEMM_L1x8_SAVE: SAVE1x8 -.LDGEMM_L1x8_END: +LDGEMM_L1x8_END: -.LDGEMM_L1x4_BEGIN: +LDGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L1x4_END + ble LDGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x4_SUB0 + ble LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x4_SUB4 + ble LDGEMM_L1x4_SUB4 -.LDGEMM_L1x4_LOOP_START: +LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1405,11 +1422,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LDGEMM_L1x4_LOOP_END + ble LDGEMM_L1x4_LOOP_END .align 5 -.LDGEMM_L1x4_LOOP: +LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1422,9 +1439,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LDGEMM_L1x4_LOOP + bgt LDGEMM_L1x4_LOOP -.LDGEMM_L1x4_LOOP_END: +LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1436,9 +1453,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB4: +LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1450,48 +1467,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB0: +LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. 
L, L, -1 - ble .LDGEMM_L1x4_SAVE - b .LDGEMM_L1x4_SUB2 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SUB1: +LDGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x4_SAVE + ble LDGEMM_L1x4_SAVE -.LDGEMM_L1x4_SUB2: +LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x4_SUB2 + bgt LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SAVE: +LDGEMM_L1x4_SAVE: SAVE1x4 -.LDGEMM_L1x4_END: +LDGEMM_L1x4_END: -.LDGEMM_L1x2_BEGIN: +LDGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L1x2_END + ble LDGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x2_SUB0 + ble LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x2_SUB4 + ble LDGEMM_L1x2_SUB4 -.LDGEMM_L1x2_LOOP_START: +LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1505,11 +1522,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LDGEMM_L1x2_LOOP_END + ble LDGEMM_L1x2_LOOP_END .align 5 -.LDGEMM_L1x2_LOOP: +LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1522,9 +1539,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LDGEMM_L1x2_LOOP + bgt LDGEMM_L1x2_LOOP -.LDGEMM_L1x2_LOOP_END: +LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1536,9 +1553,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB4: +LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1550,48 +1567,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB0: +LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x2_SAVE - b .LDGEMM_L1x2_SUB2 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SUB1: +LDGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x2_SAVE + ble LDGEMM_L1x2_SAVE -.LDGEMM_L1x2_SUB2: +LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x2_SUB2 + bgt LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SAVE: +LDGEMM_L1x2_SAVE: SAVE1x2 -.LDGEMM_L1x2_END: +LDGEMM_L1x2_END: -.LDGEMM_L1x1_BEGIN: +LDGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x1_SUB0 + ble LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x1_SUB4 + ble LDGEMM_L1x1_SUB4 -.LDGEMM_L1x1_LOOP_START: +LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1605,11 +1622,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LDGEMM_L1x1_LOOP_END + ble LDGEMM_L1x1_LOOP_END .align 5 -.LDGEMM_L1x1_LOOP: +LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1622,9 +1639,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LDGEMM_L1x1_LOOP + bgt LDGEMM_L1x1_LOOP -.LDGEMM_L1x1_LOOP_END: +LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1636,9 +1653,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB4: +LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1650,34 +1667,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB0: +LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x1_SAVE - b .LDGEMM_L1x1_SUB2 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SUB1: +LDGEMM_L1x1_SUB1: andi. 
L, K, 7 - ble .LDGEMM_L1x1_SAVE + ble LDGEMM_L1x1_SAVE -.LDGEMM_L1x1_SUB2: +LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x1_SUB2 + bgt LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SAVE: +LDGEMM_L1x1_SAVE: SAVE1x1 -.LDGEMM_L1x1_END: +LDGEMM_L1x1_END: -.LDGEMM_L1_END: +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 27c05e08e..2c7851207 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO - addi AO, AO, 64 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm + .macro KERNEL4x16_I1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 - addi AO, AO, 64 - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 - addi AO, AO, 64 - addi BO, BO, 32 + addi AO, AO, 128 .endm + + .macro KERNEL4x16_1 xvmaddadp vs32, vs0, vs24 @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 - addi AO, AO, 64 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 - addi AO, AO, 64 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm +.macro KERNEL4x16_L1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, o0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 128 + +.endm + +.macro KERNEL4x16_L2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, o32, BO + lxvdsx vs25, o40, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o48, BO + lxvdsx vs27, o56, BO + + xvmaddadp vs60, vs12, vs31 + addi AO, AO, 128 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + addi BO, BO, 64 + xvmaddadp vs63, vs15, vs31 + + +.endm + + .macro KERNEL4x16_E2 @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 - addi BO, BO, 32 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO - addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 + addi BO, BO, 32 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 + addi AO, AO, 128 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 @@ -430,21 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x16 mr T1, CO - addi T2, T1, 64 + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 + lxvd2x vs0, 0, CO + lxvd2x vs1, o16, CO + lxvd2x vs2, o32, CO + lxvd2x vs3, o48, CO + lxvd2x vs4, o64, CO + lxvd2x vs5, o80, CO + lxvd2x vs6, o96, CO + lxvd2x vs7, o112, CO - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif + lxvd2x vs8, 0, T2 + lxvd2x vs9, o16, T2 + lxvd2x vs10, o32, T2 + lxvd2x vs11, o48, T2 + lxvd2x vs12, o64, T2 + lxvd2x vs13, o80, T2 + lxvd2x vs14, o96, T2 + lxvd2x vs15, o112, T2 + + lxvd2x vs24, 0, T3 + lxvd2x vs25, o16, T3 + lxvd2x vs26, o32, T3 + lxvd2x vs27, o48, T3 + lxvd2x vs28, o64, T3 + lxvd2x vs29, o80, T3 + lxvd2x vs30, o96, T3 + lxvd2x vs31, o112, T3 -#ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r @@ -453,172 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 + lxvd2x vs32, 0, T4 + lxvd2x vs33, o16, T4 + lxvd2x vs34, o32, T4 + lxvd2x vs35, o48, T4 + lxvd2x vs36, o64, T4 + lxvd2x vs37, o80, T4 + lxvd2x vs38, o96, T4 + lxvd2x vs39, o112, T4 - dcbt T1, PRE - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - dcbt T1, PRE - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r - xvmaddadp vs2, vs50, alpha_r - xvmaddadp vs3, vs51, alpha_r - xvmaddadp vs4, vs52, alpha_r - xvmaddadp vs5, vs53, alpha_r - xvmaddadp vs6, vs54, alpha_r - xvmaddadp vs7, vs55, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - 
xvmuldp vs1, vs49, alpha_r - xvmuldp vs2, vs50, alpha_r - xvmuldp vs3, vs51, alpha_r - xvmuldp vs4, vs52, alpha_r - xvmuldp vs5, vs53, alpha_r - xvmuldp vs6, vs54, alpha_r - xvmuldp vs7, vs55, alpha_r -#endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 - dcbt T1, PRE + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 + stxvd2x vs4, o64, T1 + stxvd2x vs5, o80, T1 + stxvd2x vs6, o96, T1 + stxvd2x vs7, o112, T1 - add T1, T1, LDC - add T2, T2, LDC + xvmaddadp vs24, vs48, alpha_r + xvmaddadp vs25, vs49, alpha_r + xvmaddadp vs26, vs50, alpha_r + xvmaddadp vs27, vs51, alpha_r -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 + stxvd2x vs8, o0, T2 + stxvd2x vs9, o16, T2 + stxvd2x vs10, o32, T2 + stxvd2x vs11, o48, T2 - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif + xvmaddadp vs28, vs52, alpha_r + xvmaddadp vs29, vs53, alpha_r + xvmaddadp vs30, vs54, alpha_r + xvmaddadp vs31, vs55, alpha_r -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r - xvmaddadp vs10, vs58, alpha_r - xvmaddadp vs11, vs59, alpha_r - xvmaddadp vs12, vs60, alpha_r - xvmaddadp vs13, vs61, alpha_r - xvmaddadp vs14, vs62, alpha_r - xvmaddadp vs15, vs63, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r - xvmuldp vs10, vs58, alpha_r - xvmuldp vs11, vs59, alpha_r - xvmuldp vs12, vs60, alpha_r - xvmuldp vs13, vs61, alpha_r - xvmuldp vs14, vs62, alpha_r - xvmuldp vs15, vs63, alpha_r -#endif + stxvd2x vs12, o64, T2 + stxvd2x vs13, o80, T2 + stxvd2x vs14, o96, T2 + stxvd2x vs15, o112, T2 - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 + xvmaddadp vs32, vs56, alpha_r + xvmaddadp vs33, vs57, alpha_r + xvmaddadp vs34, vs58, alpha_r + xvmaddadp vs35, vs59, alpha_r - dcbt T1, PRE + stxvd2x vs24, 0, T3 + stxvd2x vs25, o16, T3 + stxvd2x vs26, o32, T3 + stxvd2x vs27, o48, T3 - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 + xvmaddadp vs36, vs60, alpha_r + xvmaddadp vs37, vs61, alpha_r + xvmaddadp vs38, vs62, alpha_r + xvmaddadp vs39, vs63, alpha_r + + stxvd2x vs28, o64, T3 + stxvd2x vs29, o80, T3 + stxvd2x vs30, o96, T3 + stxvd2x vs31, o112, T3 + + stxvd2x vs32, o0, T4 + stxvd2x vs33, o16, T4 + stxvd2x vs34, o32, T4 + stxvd2x vs35, o48, T4 addi CO, CO, 128 + stxvd2x vs36, o64, T4 + stxvd2x vs37, o80, T4 + stxvd2x vs38, o96, T4 + stxvd2x vs39, o112, T4 + + .endm /********************************************************************* diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S new file mode 100644 index 000000000..31966047f --- /dev/null +++ b/kernel/power/dgemm_ncopy_4_power8.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define o64 r17 +#define o80 r18 +#define o96 r19 +#define o112 r20 +#define o8 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r27 +#define NOTU2 r30 +#define T1 r31 + +#define o0 0 + +#include "dgemm_ncopy_macros_4_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + + li PREA, 384 + li PREB, 384 + + li o8, 8 + li o16, 16 + li o32, 32 + li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 + +#include "dgemm_ncopy_logic_4_power8.S" + +L999: + + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/dgemm_ncopy_logic_4_power8.S b/kernel/power/dgemm_ncopy_logic_4_power8.S new file mode 100644 index 000000000..6944a7818 --- /dev/null +++ b/kernel/power/dgemm_ncopy_logic_4_power8.S @@ -0,0 +1,237 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + mr BO, B + srawi. I, N, 2 + ble DCOPYN_L2_BEGIN + + +DCOPYN_L4_BEGIN: + + +DCOPYN_L4_LOOP: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + +DCOPYN_L4x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L4x16_END + +DCOPYN_L4x16_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x16 + addic. J, J, -1 + bgt DCOPYN_L4x16_LOOP + +DCOPYN_L4x16_END: + + +DCOPYN_L4x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L4x8_END + COPY_4x8 + +DCOPYN_L4x8_END: + + +DCOPYN_L4x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L4x4_END + COPY_4x4 + +DCOPYN_L4x4_END: + + +DCOPYN_L4x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L4x2_END + COPY_4x2 + +DCOPYN_L4x2_END: + + +DCOPYN_L4x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L4x1_END + COPY_4x1 + +DCOPYN_L4x1_END: + + +DCOPYN_L4_END: + + addic. I, I, -1 + bgt DCOPYN_L4_LOOP + +DCOPYN_L2_BEGIN: + + andi. T1, 4, 2 + ble DCOPYN_L2_END + +DCOPYN_L2_LOOP: + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + +DCOPYN_L2x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L2x16_END + +DCOPYN_L2x16_LOOP: + + COPY_2x16 + addic. J, J, -1 + bgt DCOPYN_L2x16_LOOP + +DCOPYN_L2x16_END: + + +DCOPYN_L2x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L2x8_END + COPY_2x8 + +DCOPYN_L2x8_END: + + +DCOPYN_L2x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L2x4_END + COPY_2x4 + +DCOPYN_L2x4_END: + + +DCOPYN_L2x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L2x2_END + COPY_2x2 + +DCOPYN_L2x2_END: + + +DCOPYN_L2x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L2x1_END + COPY_2x1 + +DCOPYN_L2x1_END: + + +DCOPYN_L2_END: + + +DCOPYN_L1_BEGIN: + + andi. T1, 4, 1 + ble DCOPYN_L1_END + +DCOPYN_L1_LOOP: + + mr A0, A + add A, A0, LDA + +DCOPYN_L1x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L1x16_END + +DCOPYN_L1x16_LOOP: + + COPY_1x16 + addic. J, J, -1 + bgt DCOPYN_L1x16_LOOP + +DCOPYN_L1x16_END: + + +DCOPYN_L1x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L1x8_END + COPY_1x8 + +DCOPYN_L1x8_END: + + +DCOPYN_L1x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L1x4_END + COPY_1x4 + +DCOPYN_L1x4_END: + + +DCOPYN_L1x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L1x2_END + COPY_1x2 + +DCOPYN_L1x2_END: + + +DCOPYN_L1x1_BEGIN: + + andi. 
J, M, 1 + ble DCOPYN_L1x1_END + COPY_1x1 + +DCOPYN_L1x1_END: + + +DCOPYN_L1_END: + diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S new file mode 100644 index 000000000..9b07d73f5 --- /dev/null +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -0,0 +1,691 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs8, o0, A1 + lxvd2x vs24, o0, A3 + lxvd2x vs16, o0, A2 + + lxvd2x vs1, o16, A0 + lxvd2x vs9, o16, A1 + lxvd2x vs17, o16, A2 + lxvd2x vs25, o16, A3 + + lxvd2x vs2, o32, A0 + lxvd2x vs10, o32, A1 + lxvd2x vs18, o32, A2 + lxvd2x vs26, o32, A3 + + lxvd2x vs3, o48, A0 + lxvd2x vs11, o48, A1 + lxvd2x vs19, o48, A2 + lxvd2x vs27, o48, A3 + + lxvd2x vs4, o64, A0 + lxvd2x vs12, o64, A1 + lxvd2x vs20, o64, A2 + lxvd2x vs28, o64, A3 + + lxvd2x vs5, o80, A0 + lxvd2x vs13, o80, A1 + lxvd2x vs21, o80, A2 + lxvd2x vs29, o80, A3 + + lxvd2x vs6, o96, A0 + lxvd2x vs14, o96, A1 + lxvd2x vs22, o96, A2 + lxvd2x vs30, o96, A3 + + lxvd2x vs7, o112, A0 + lxvd2x vs15, o112, A1 + lxvd2x vs23, o112, A2 + lxvd2x vs31, o112, A3 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + xxpermdi vs48, vs4, vs12, 0 + xxpermdi vs49, vs20, vs28, 0 + xxpermdi vs50, vs4, vs12, 3 + xxpermdi vs51, vs20, vs28, 3 + + xxpermdi vs52, vs5, vs13, 0 + xxpermdi vs53, vs21, vs29, 0 + xxpermdi vs54, vs5, vs13, 3 + xxpermdi vs55, vs21, vs29, 3 + + addi A0, A0, 128 + addi A1, A1, 128 + + xxpermdi vs56, vs6, vs14, 0 + xxpermdi vs57, vs22, vs30, 0 + xxpermdi vs58, vs6, vs14, 3 + xxpermdi vs59, vs22, vs30, 3 + + addi A3, A3, 128 + addi A2, A2, 128 + + xxpermdi vs60, vs7, vs15, 0 + xxpermdi vs61, vs23, vs31, 0 + xxpermdi vs62, vs7, vs15, 3 + xxpermdi vs63, vs23, vs31, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + stxvd2x vs48, o0, BO + stxvd2x vs49, o16, BO + stxvd2x vs50, o32, BO + stxvd2x vs51, o48, BO + stxvd2x vs52, o64, BO + stxvd2x vs53, o80, BO + stxvd2x vs54, o96, BO + stxvd2x vs55, o112, BO + addi BO, BO, 128 + + stxvd2x vs56, o0, BO + stxvd2x vs57, o16, BO + stxvd2x vs58, o32, BO + stxvd2x vs59, o48, BO + stxvd2x vs60, o64, BO + stxvd2x vs61, o80, BO + stxvd2x vs62, o96, BO + stxvd2x vs63, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x 
vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + lxvd2x vs18, o32, A2 + lxvd2x vs19, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + lxvd2x vs26, o32, A3 + lxvd2x vs27, o48, A3 + addi A3, A3, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + addi A3, A3, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs16, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs24, o0, A3 + addi A3, A3, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + lxsdx vs16, o0, A2 + addi A2, A2, 8 + + + lxsdx vs24, o0, A3 + addi A3, A3, 8 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + 
+.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + lxvd2x vs12, o64, A1 + lxvd2x vs13, o80, A1 + lxvd2x vs14, o96, A1 + lxvd2x vs15, o112, A1 + addi A1, A1, 128 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + xxpermdi vs40, vs4, vs12, 0 + xxpermdi vs41, vs4, vs12, 3 + + xxpermdi vs42, vs5, vs13, 0 + xxpermdi vs43, vs5, vs13, 3 + + xxpermdi vs44, vs6, vs14, 0 + xxpermdi vs45, vs6, vs14, 3 + + xxpermdi vs46, vs7, vs15, 0 + xxpermdi vs47, vs7, vs15, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + + stxvd2x vs32, o0, 
BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + xxpermdi vs32, vs0, vs8, 0 + + + stxvd2x vs32, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + stxvd2x vs4, o0, BO + stxvd2x vs5, o16, BO + stxvd2x vs6, o32, BO + stxvd2x vs7, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + stxvd2x vs0, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + stxsdx vs0, o0, BO + addi BO, BO, 8 + + +.endm + diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index f87af535d..eca78bac4 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add B2, B2, B add B1, B1, B - li PREA, 768 + li PREA, 256 addi PREB, M16, 128 li o8, 8 diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S index 776cd3401..28fc74793 100644 --- a/kernel/power/dgemm_tcopy_logic_16_power8.S +++ b/kernel/power/dgemm_tcopy_logic_16_power8.S @@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN: DCOPYT_L4x16_LOOP: +/* addi T1, PREB, 128 addi T2, PREB, 256 +*/ dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA +/* dcbtst BO, M16 dcbtst BO, PREB dcbtst BO, T1 dcbtst BO, T2 +*/ COPY_4x16 add BO, BO, M16 diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 2294128a2..e9dbd991e 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PRE r30 #define T2 r31 -#include "dgemm_macros_16x4_power8.S" +#include "dtrmm_macros_16x4_power8.S" #ifndef NEEDPARAM diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S new file mode 100644 index 000000000..079144a90 --- /dev/null +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -0,0 +1,3431 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + 
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi 
BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, 
vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp 
vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, 
vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + 
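+/* SAVE4x4: write back a 4x4 tile of C, stepping one column of C per LDC.
+ * In the plain GEMM build the old tile is loaded and updated as
+ * C = C + alpha * acc (xvmaddadp); with TRMMKERNEL defined the tile is
+ * overwritten with alpha * acc (xvmuldp) and C is never read back. */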
+.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, 
alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 
0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp 
vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + 
xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO 
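+/* lxvdsx splats one B element into both doubleword lanes of the register,
+ * so each vector multiply against a two-element column of A below
+ * produces two C results of the same output column in a single op. */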
+ + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 
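+/* the old C values are read only on this GEMM path; the TRMMKERNEL
+ * path below stores alpha * acc without accumulating into C */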
+#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + 
xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x 
vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + 
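+/* All KERNELnxm groups in this file share one software-pipelining protocol:
+ * LOADnxm_1 primes the first register set (vs0.. for A, vs24.. for B),
+ * _I1 starts the accumulators with plain multiplies while loading the
+ * alternate set (vs8.. / vs28..), _1 and _2 ping-pong between the two sets
+ * with fused multiply-adds, _E2 consumes the in-flight data without issuing
+ * new loads, and _SUBI1 / _SUB1 cover the iterations left outside the
+ * pipelined loop (_SUBI1 initializing, _SUB1 accumulating). */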
+/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/sgemm_tcopy_8_power8.S b/kernel/power/sgemm_tcopy_8_power8.S new file mode 100644 index 000000000..2bbd6e696 --- /dev/null +++ b/kernel/power/sgemm_tcopy_8_power8.S @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define o4 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r29 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "sgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + slwi M8, M, 3 + BASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, BASE_SHIFT + slwi B2, B2, BASE_SHIFT + slwi B1, B1, BASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o4, 4 + li o16, 16 + li o32, 32 + li o48, 48 + +#include "sgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/sgemm_tcopy_logic_8_power8.S b/kernel/power/sgemm_tcopy_logic_8_power8.S new file mode 100644 index 000000000..4cf74baa3 --- /dev/null +++ b/kernel/power/sgemm_tcopy_logic_8_power8.S @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble SCOPYOT_L2_BEGIN + + +SCOPYOT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L4x4_BEGIN + + mr BO, B8 + .align 5 + +SCOPYOT_L4x8_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + ble SCOPYOT_L4x4_BEGIN + + COPY_4x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + addi A2, A2, 8*SIZE + addi A3, A3, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L4x8_LOOP + +SCOPYOT_L4x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + addi A2, A2, 4*SIZE + addi A3, A3, 4*SIZE + + addi B4, B4, 16*SIZE + +SCOPYOT_L4x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + addi A2, A2, 2*SIZE + addi A3, A3, 2*SIZE + + addi B2, B2, 8*SIZE + +SCOPYOT_L4x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L4_END + + mr BO, B1 + + COPY_4x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + addi A2, A2, 1*SIZE + addi A3, A3, 1*SIZE + + addi B1, B1, 4*SIZE + +SCOPYOT_L4_END: + + addic. I, I, -1 + bgt SCOPYOT_L4_BEGIN + + + +SCOPYOT_L2_BEGIN: + + andi. T1, M, 2 + ble SCOPYOT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L2x4_BEGIN + + mr BO, B8 + +SCOPYOT_L2x8_LOOP: + + COPY_2x8 + + addi A0, A0, 8*SIZE + addi A1, A1, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L2x8_LOOP + +SCOPYOT_L2x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + addi A0, A0, 4*SIZE + addi A1, A1, 4*SIZE + + addi B4, B4, 8*SIZE + +SCOPYOT_L2x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + addi A0, A0, 2*SIZE + addi A1, A1, 2*SIZE + + addi B2, B2, 4*SIZE + +SCOPYOT_L2x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L2_END + + mr BO, B1 + + COPY_2x1 + + addi A0, A0, 1*SIZE + addi A1, A1, 1*SIZE + + addi B1, B1, 2*SIZE + +SCOPYOT_L2_END: + + +SCOPYOT_L1_BEGIN: + + andi. 
T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 8*SIZE + + sradi. J, N, 3 + ble SCOPYOT_L1x4_BEGIN + + mr BO, B8 + +SCOPYOT_L1x8_LOOP: + + COPY_1x8 + + addi A0, A0, 8*SIZE + add BO, BO, M8 + + addic. J, J, -1 + bgt SCOPYOT_L1x8_LOOP + +SCOPYOT_L1x4_BEGIN: + + andi. T1, N, 4 + ble SCOPYOT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + addi A0, A0, 4*SIZE + + addi B4, B4, 4*SIZE + +SCOPYOT_L1x2_BEGIN: + + andi. T1, N, 2 + ble SCOPYOT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + addi A0, A0, 2*SIZE + + addi B2, B2, 2*SIZE + +SCOPYOT_L1x1_BEGIN: + + andi. T1, N, 1 + ble SCOPYOT_L1_END + + mr BO, B1 + + COPY_1x1 + + addi A0, A0, 1*SIZE + + addi B1, B1, 1*SIZE + +SCOPYOT_L1_END: + diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S new file mode 100644 index 000000000..1b71d5bb3 --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_8_power8.S @@ -0,0 +1,308 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + lxvw4x vs36, o0, A2 + lxvw4x vs37, o16, A2 + + lxvw4x vs38, o0, A3 + lxvw4x vs39, o16, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + + addi T1, T1, 64 + + stxvw4x vs36, o0, T1 + stxvw4x vs37, o16, T1 + + stxvw4x vs38, o32, T1 + stxvw4x vs39, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + lxvw4x vs34, o0, A2 + + lxvw4x vs35, o0, A3 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + lxsspx vs36, o0, A2 + lxsspx vs37, o4, A2 + + lxsspx vs38, o0, A3 + lxsspx vs39, o4, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + + addi T1, T1, 8 + + stxsspx vs36, o0, T1 + stxsspx vs37, o4, T1 + + addi T1, T1, 8 + + stxsspx vs38, o0, T1 + stxsspx vs39, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + lxsspx vs34, o0, A2 + + lxsspx vs35, o0, A3 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + lxvw4x vs34, o0, A1 + lxvw4x vs35, o16, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + + stxvw4x vs34, o32, T1 + stxvw4x vs35, o48, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvw4x vs32, o0, A0 + + lxvw4x vs33, o0, A1 + + mr T1, BO + + stxvw4x vs32, o0, T1 + + stxvw4x vs33, o16, T1 + +.endm + 
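+/**********************************************************************************************
+* Note on the COPY_NxM macros above and below: each macro reads N rows through the
+* pointers A0..A3 with the fixed offsets o0/o4/o16/o32/o48 and stores every row's
+* M elements contiguously into the panel buffer through T1, a scratch copy of BO.
+* In C terms, for rows i = 0..N-1 (B = current panel position):
+*
+*     for (j = 0; j < M; j++) B[i*M + j] = Ai[j];
+*
+* lxvw4x/stxvw4x move four single-precision elements per vector register, while
+* lxsspx/stxsspx handle the scalar tails for M < 4.
+**********************************************************************************************/
+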
+/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + lxsspx vs34, o0, A1 + lxsspx vs35, o4, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + + addi T1, T1, 8 + + stxsspx vs34, o0, T1 + stxsspx vs35, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsspx vs32, o0, A0 + + lxsspx vs33, o0, A1 + + mr T1, BO + + stxsspx vs32, o0, T1 + + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvw4x vs32, o0, A0 + lxvw4x vs33, o16, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + stxvw4x vs33, o16, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvw4x vs32, o0, A0 + + mr T1, BO + + stxvw4x vs32, o0, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxsspx vs32, o0, A0 + lxsspx vs33, o4, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + stxsspx vs33, o4, T1 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsspx vs32, o0, A0 + + mr T1, BO + + stxsspx vs32, o0, T1 + +.endm + diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 336b13b1f..02c94a88a 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,3 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin. */
 /* All rights reserved. */
@@ -250,7 +320,7 @@
 	ble	L999

 	slwi	LDC, LDC, ZBASE_SHIFT
-	li	PRE, 384
+	li	PRE, 512
 	li	o8 , 8
 	li	o16 , 16
 	li	o24 , 24
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
index 96612da82..0cd784cc0 100644
--- a/kernel/power/zgemm_logic_8x2_power8.S
+++ b/kernel/power/zgemm_logic_8x2_power8.S
@@ -1,3 +1,39 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 1 ble ZGEMM_L2_END @@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN: mr BO, B mr BBO, BBUFFER - slwi T1, K, 1 + srawi. T1, K, 2 + ble ZGEMM_L2_COPYB1 -ZGEMM_L2_COPYB: +ZGEMM_L2_COPYB8: - lxvdsx vs4, o0, BO // b0_r - lxvdsx vs5, o8, BO // b0_i - addi BO, BO, 16 - stxvd2x vs4, o0, BBO - stxvd2x vs5, o16, BBO + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8x1 addic. T1, T1, -1 - addi BBO, BBO, 32 - bge ZGEMM_L2_COPYB + bgt ZGEMM_L2_COPYB8 +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_1x1 + ZCOPYB_1x1 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP + +ZGEMM_L2_COPYB_END: mr CO, C mr AO, A @@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN: slwi T1, K, 0 ZGEMM_L1_COPYB: + dcbtst BBO, PRE lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index a0fbb2e11..c43a115b2 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -3055,3 +3090,76 @@ .endm + + +.macro ZCOPYB_1x1 + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addi BBO, BBO, 32 + +.endm + + +.macro ZCOPYB_8x1 + + lxvd2x vs32, o0, BO + lxvd2x vs33, o16, BO + lxvd2x vs34, o32, BO + lxvd2x vs35, o48, BO + addi BO, BO, 64 + + lxvd2x vs36, o0, BO + lxvd2x vs37, o16, BO + lxvd2x vs38, o32, BO + lxvd2x vs39, o48, BO + addi BO, BO, 64 + + xxspltd vs40, vs32, 0 + xxspltd vs41, vs32, 1 + xxspltd vs42, vs33, 0 + xxspltd vs43, vs33, 1 + xxspltd vs44, vs34, 0 + xxspltd vs45, vs34, 1 + xxspltd vs46, vs35, 0 + xxspltd vs47, vs35, 1 + + xxspltd vs48, vs36, 0 + xxspltd vs49, vs36, 1 + xxspltd vs50, vs37, 0 + xxspltd vs51, vs37, 1 + xxspltd vs52, vs38, 0 + xxspltd vs53, vs38, 1 + xxspltd vs54, vs39, 0 + xxspltd vs55, vs39, 1 + + stxvd2x vs40, o0, BBO + stxvd2x vs41, o16, BBO + stxvd2x vs42, o32, BBO + stxvd2x vs43, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs44, o0, BBO + stxvd2x vs45, o16, BBO + stxvd2x vs46, o32, BBO + stxvd2x vs47, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs48, o0, BBO + stxvd2x vs49, o16, BBO + stxvd2x vs50, o32, BBO + stxvd2x vs51, o48, BBO + addi BBO, BBO, 64 + + stxvd2x vs52, o0, BBO + stxvd2x vs53, o16, BBO + stxvd2x vs54, o32, BBO + stxvd2x vs55, o48, BBO + addi BBO, BBO, 64 + +.endm + + diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S new file mode 100644 index 000000000..1f3f35419 --- /dev/null +++ b/kernel/power/zgemm_tcopy_8_power8.S @@ -0,0 +1,205 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define B8 r17 +#define B4 r18 +#define B2 r19 +#define B1 r20 +#define NOTUS1 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTUS2 r27 +#define M8 r30 +#define T1 r31 + +#define o0 0 + +#include "zgemm_tcopy_macros_8_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, ZBASE_SHIFT + slwi M8, M, 3 + ZBASE_SHIFT + + li T2, -8 + li PREA, -4 + li PREB, -2 + + and B4, N, T2 + and B2, N, PREA + and B1, N, PREB + + mullw B4, B4, M + mullw B2, B2, M + mullw B1, B1, M + + slwi B4, B4, ZBASE_SHIFT + slwi B2, B2, ZBASE_SHIFT + slwi B1, B1, ZBASE_SHIFT + + add B4, B4, B + add B2, B2, B + add B1, B1, B + + li PREA, 384 + addi PREB, M8, 128 + + li o16, 16 + li o32, 32 + li o48, 48 + +#include "zgemm_tcopy_logic_8_power8.S" + +L999: + + li r3, 0 + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/zgemm_tcopy_logic_8_power8.S b/kernel/power/zgemm_tcopy_logic_8_power8.S new file mode 100644 index 000000000..34fd307bd --- /dev/null +++ b/kernel/power/zgemm_tcopy_logic_8_power8.S @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. I, M, 2 + ble ZCOPYT_L2_BEGIN + + +ZCOPYT_L4_BEGIN: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + mr B8, B + addi B, B, 64*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L4x4_BEGIN + + mr BO, B8 + + .align 5 + +ZCOPYT_L4x8_LOOP: + + addi T1, PREB, 128 + addi T2, PREB, 256 + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + dcbtst BO, M8 + dcbtst BO, PREB + dcbtst BO, T1 + dcbtst BO, T2 + + COPY_4x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L4x8_LOOP + +ZCOPYT_L4x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L4x2_BEGIN + + mr BO, B4 + + COPY_4x4 + + + addi B4, B4, 32*SIZE + +ZCOPYT_L4x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L4x1_BEGIN + + mr BO, B2 + + COPY_4x2 + + + addi B2, B2, 16*SIZE + +ZCOPYT_L4x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L4_END + + mr BO, B1 + + COPY_4x1 + + + addi B1, B1, 8*SIZE + +ZCOPYT_L4_END: + + addic. I, I, -1 + bgt ZCOPYT_L4_BEGIN + + + +ZCOPYT_L2_BEGIN: + + andi. T1, M, 2 + ble ZCOPYT_L1_BEGIN + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + mr B8, B + addi B, B, 32*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L2x4_BEGIN + + mr BO, B8 + +ZCOPYT_L2x8_LOOP: + + COPY_2x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L2x8_LOOP + +ZCOPYT_L2x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L2x2_BEGIN + + mr BO, B4 + + COPY_2x4 + + + addi B4, B4, 16*SIZE + +ZCOPYT_L2x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L2x1_BEGIN + + mr BO, B2 + + COPY_2x2 + + + addi B2, B2, 8*SIZE + +ZCOPYT_L2x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L2_END + + mr BO, B1 + + COPY_2x1 + + + addi B1, B1, 4*SIZE + +ZCOPYT_L2_END: + + +ZCOPYT_L1_BEGIN: + + andi. T1, M, 1 + ble L999 + + mr A0, A + add A, A0, LDA + mr B8, B + addi B, B, 16*SIZE + + sradi. J, N, 3 + ble ZCOPYT_L1x4_BEGIN + + mr BO, B8 + +ZCOPYT_L1x8_LOOP: + + COPY_1x8 + + add BO, BO, M8 + + addic. J, J, -1 + bgt ZCOPYT_L1x8_LOOP + +ZCOPYT_L1x4_BEGIN: + + andi. T1, N, 4 + ble ZCOPYT_L1x2_BEGIN + + mr BO, B4 + + COPY_1x4 + + + addi B4, B4, 8*SIZE + +ZCOPYT_L1x2_BEGIN: + + andi. T1, N, 2 + ble ZCOPYT_L1x1_BEGIN + + mr BO, B2 + + COPY_1x2 + + + addi B2, B2, 4*SIZE + +ZCOPYT_L1x1_BEGIN: + + andi. T1, N, 1 + ble ZCOPYT_L1_END + + mr BO, B1 + + COPY_1x1 + + + addi B1, B1, 2*SIZE + +ZCOPYT_L1_END: + diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S new file mode 100644 index 000000000..e8c2f0baa --- /dev/null +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs48, o0, A2 + lxvd2x vs49, o16, A2 + lxvd2x vs50, o32, A2 + lxvd2x vs51, o48, A2 + addi A2, A2, 64 + + lxvd2x vs52, o0, A2 + lxvd2x vs53, o16, A2 + lxvd2x vs54, o32, A2 + lxvd2x vs55, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs56, o0, A3 + lxvd2x vs57, o16, A3 + lxvd2x vs58, o32, A3 + lxvd2x vs59, o48, A3 + addi A3, A3, 64 + + lxvd2x vs60, o0, A3 + lxvd2x vs61, o16, A3 + lxvd2x vs62, o32, A3 + lxvd2x vs63, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 + addi T1, T1, 64 
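+	// stores continue: second 64-byte half of row A2, then both halves of row A3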
+ + stxvd2x vs52, o0, T1 + stxvd2x vs53, o16, T1 + stxvd2x vs54, o32, T1 + stxvd2x vs55, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs56, o0, T1 + stxvd2x vs57, o16, T1 + stxvd2x vs58, o32, T1 + stxvd2x vs59, o48, T1 + addi T1, T1, 64 + + stxvd2x vs60, o0, T1 + stxvd2x vs61, o16, T1 + stxvd2x vs62, o32, T1 + stxvd2x vs63, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs40, o0, A2 + lxvd2x vs41, o16, A2 + lxvd2x vs42, o32, A2 + lxvd2x vs43, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs44, o0, A3 + lxvd2x vs45, o16, A3 + lxvd2x vs46, o32, A3 + lxvd2x vs47, o48, A3 + addi A3, A3, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs36, o0, A2 + lxvd2x vs37, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs38, o0, A3 + lxvd2x vs39, o16, A3 + addi A3, A3, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs34, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs35, o0, A3 + addi A3, A3, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs40, o0, A1 + lxvd2x vs41, o16, A1 + lxvd2x vs42, o32, A1 + lxvd2x vs43, o48, A1 + addi A1, A1, 64 + + lxvd2x vs44, o0, A1 + lxvd2x vs45, o16, A1 + lxvd2x vs46, o32, A1 + lxvd2x vs47, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 
+ stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 + addi T1, T1, 64 + + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs36, o0, A1 + lxvd2x vs37, o16, A1 + lxvd2x vs38, o32, A1 + lxvd2x vs39, o48, A1 + addi A1, A1, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs34, o0, A1 + lxvd2x vs35, o16, A1 + addi A1, A1, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs33, o0, A1 + addi A1, A1, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + + stxvd2x vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + lxvd2x vs36, o0, A0 + lxvd2x vs37, o16, A0 + lxvd2x vs38, o32, A0 + lxvd2x vs39, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + addi T1, T1, 64 + + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + lxvd2x vs34, o32, A0 + lxvd2x vs35, o48, A0 + addi A0, A0, 64 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs32, o0, A0 + lxvd2x vs33, o16, A0 + addi A0, A0, 32 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + stxvd2x 
vs33, o16, T1 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxvd2x vs32, o0, A0 + addi A0, A0, 16 + + + mr T1, BO + + stxvd2x vs32, o0, T1 + +.endm + diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index a4d1486fc..ba44b8f61 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,23 @@ static void init_parameter(void) { #endif #endif +#ifdef EXCAVATOR + +#ifdef DEBUG + fprintf(stderr, "Excavator\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + #ifdef PILEDRIVER #ifdef DEBUG diff --git a/kernel/x86_64/KERNEL.EXCAVATOR b/kernel/x86_64/KERNEL.EXCAVATOR index dbdd1fe9b..4ec748284 100644 --- a/kernel/x86_64/KERNEL.EXCAVATOR +++ b/kernel/x86_64/KERNEL.EXCAVATOR @@ -1,3 +1,7 @@ +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c @@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S @@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c +STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c +STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c +STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c +CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c +CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c +CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c +ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c +ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c +ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c +ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 1ee0499a7..5af9b8fcc 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 2b2c4ff7a..9bba72ba2 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" #elif defined(HASWELL) #include "cdot_microk_haswell-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index d60e4475d..235510534 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cgemv_n_microk_haswell-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index b558164ff..1a714f61f 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cgemv_t_microk_haswell-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 5d86b1929..c44d12e3d 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "cscal_microk_steamroller-2.c" #elif defined(SANDYBRIDGE) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 56d323cbe..18569e6e4 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_nehalem-2.c" #elif defined(BULLDOZER) #include "daxpy_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 4bf8082c9..a45dd7f3b 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 485b234b0..4200b8acd 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(STEAMROLLER) +#elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 8ed821dd0..42f11f39a 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(STEAMROLLER) +#if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index b7110e6ac..bbc1c9660 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 3f5e77e5f..e10784ad7 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) #include "dsymv_L_microk_haswell-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 9f5ae3015..bd07ce2c3 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" #elif defined(HASWELL) #include "dsymv_U_microk_haswell-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 0b76c42f7..b9e5d5784 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" -#elif defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "saxpy_microk_piledriver-2.c" #endif diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index a3d20d276..d9fc417a0 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "sdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index c7b4516c3..bdf68dd07 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_haswell-4.c" #endif -#if defined(STEAMROLLER) +#if defined(STEAMROLLER) || defined(EXCAVATOR) #define NBMAX 2048 #else #define NBMAX 4096 diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 5c7d1a53b..62550e65c 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_haswell-4.c" #endif -#if defined(STEAMROLLER) +#if defined(STEAMROLLER) || defined(EXCAVATOR) #define NBMAX 2048 #else #define NBMAX 4096 diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 0997f108d..3813981ed 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index ed1e8236c..e4d3c9b30 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 560acc7f9..0cd555a68 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "zaxpy_microk_bulldozer-2.c" -#elif defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" #elif defined(HASWELL) #include "zaxpy_microk_haswell-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index eee00fd9f..4533d4e88 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) || defined(PILEDRIVER) +#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" #elif defined(HASWELL) #include "zdot_microk_haswell-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 63e49f2af..4171fc99f 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_n_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 4abb2d5ad..0524c71f7 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-4.c" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index a96766032..7ca8774b7 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" -#elif defined(STEAMROLLER) +#elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "zscal_microk_steamroller-2.c" #endif diff --git a/param.h b/param.h index 6948e6a76..abe739af2 100644 --- a/param.h +++ b/param.h @@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P 960
-#define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 720
-#define ZGEMM_DEFAULT_P 480
+#define SGEMM_DEFAULT_P 1280
+#define DGEMM_DEFAULT_P 640
+#define CGEMM_DEFAULT_P 640
+#define ZGEMM_DEFAULT_P 320
 
-#define SGEMM_DEFAULT_Q 720
-#define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 720
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 640
+#define CGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 640
 
 #define SYMV_P 8
 
diff --git a/test/Makefile b/test/Makefile
index 75ea6de60..65fb6f438 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,6 +4,7 @@ include ../Makefile.system
 all :: level1 level2 level3
 
 level1 : sblat1 dblat1 cblat1 zblat1
+ifndef CROSS
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
@@ -21,8 +22,10 @@ else
 	OPENBLAS_NUM_THREADS=2 ./zblat1
 endif
 endif
+endif
 
 level2 : sblat2 dblat2 cblat2 zblat2
+ifndef CROSS
 	rm -f ?BLAT2.SUMM
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
 	@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
@@ -54,8 +57,10 @@ else
 	@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
 endif
 endif
+endif
 
 level3 : sblat3 dblat3 cblat3 zblat3
+ifndef CROSS
 	rm -f ?BLAT3.SUMM
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
 	@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
@@ -87,9 +92,11 @@ else
 	@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
 endif
 endif
+endif
 
 
 level3_3m : zblat3_3m cblat3_3m
+ifndef CROSS
 	rm -f ?BLAT3_3M.SUMM
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
 	@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -109,6 +116,7 @@ else
 	@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
 endif
 endif
+endif
 
 
 
diff --git a/utest/Makefile b/utest/Makefile
index 9f9808920..3ccc0a041 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
 	$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
 
 run_test: $(UTESTBIN)
+ifndef CROSS
 	./$(UTESTBIN)
+endif
 
 clean:
 	-rm -f *.o $(UTESTBIN)