diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78d5e0eb6..ead63bff8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
-set(OpenBLAS_PATCH_VERSION 17)
+set(OpenBLAS_PATCH_VERSION 18)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index da56c0758..ebe52ea8a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -147,5 +147,6 @@ In chronological order:
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
-* [Your name or handle] <[email or website]>
- * [Date] [Brief summary of your changes]
+* theoractice
+ * [2016-03-20] Fix compiler error in VisualStudio with CMake
+ * [2016-03-22] Fix access violation on Windows while static linking
diff --git a/Changelog.txt b/Changelog.txt
index c59166c38..7f82e8e88 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,22 @@
OpenBLAS ChangeLog
+====================================================================
+Version 0.2.18
+12-Apr-2016
+common:
+ * If the MAKE_NB_JOBS flag is set to a value less than or equal to zero,
+   make will be invoked without a -j flag.
+
+x86/x86_64:
+ * Support building a static library with Visual Studio. (#813, Thanks, theoractice)
+ * Fix bugs to pass buildbot CI tests (http://build.openblas.net)
+
+ARM:
+ * Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
+
+POWER:
+ * Optimize S and C BLAS3 on Power8
+ * Optimize BLAS2/1 on Power8
+
====================================================================
Version 0.2.17
20-Mar-2016
diff --git a/Makefile.rule b/Makefile.rule
index 0758a48a8..d8db6102c 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
#
# This library's version
-VERSION = 0.2.17
+VERSION = 0.2.18
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -112,7 +112,10 @@ NO_AFFINITY = 1
# NO_PARALLEL_MAKE = 1
# Force number of make jobs. The default is the number of logical CPU of the host.
-# This is particularly useful when using distcc
+# This is particularly useful when using distcc.
+# A negative value disables adding a -j flag to make, allowing a parent
+# make's -j value to be used instead. This is useful when invoking the
+# OpenBLAS make from another project's makefile.
# MAKE_NB_JOBS = 2
# If you would like to know minute performance report of GotoBLAS.
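A quick illustration of the MAKE_NB_JOBS semantics described above (a sketch only; the job counts are arbitrary):

    # Default: OpenBLAS adds -j equal to the number of logical CPUs of the host.
    make

    # Fixed job count, as before:
    make MAKE_NB_JOBS=4

    # New in 0.2.18: a value <= 0 suppresses OpenBLAS's own -j, so the -j of an
    # outer make (or of this command line) controls parallelism instead.
    make -j8 MAKE_NB_JOBS=-1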
diff --git a/appveyor.yml b/appveyor.yml
index 172a49b42..5360a9ef9 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,4 @@
-version: 0.2.15.{build}
+version: 0.2.18.{build}
#environment:
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 11d3c5bec..8166f3863 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
# Apple vecLib
LIBVECLIB = -framework Accelerate
+ESSL=/opt/ibm/lib
+#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
+LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
+
ifeq ($(OSNAME), WINNT)
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
@@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto cger.goto zger.goto \
sdot.goto ddot.goto \
+ srot.goto drot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
sswap.goto dswap.goto cswap.goto zswap.goto \
@@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto cger.goto zger.goto \
sdot.goto ddot.goto cdot.goto zdot.goto \
+ srot.goto drot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
sswap.goto dswap.goto cswap.goto zswap.goto \
@@ -253,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
endif
-
+essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
+ cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
+ slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@@ -306,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
slinpack.veclib : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+slinpack.essl : slinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Dlinpack ####################################################
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -322,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
dlinpack.veclib : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dlinpack.essl : dlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Clinpack ####################################################
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
@@ -339,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
clinpack.veclib : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+clinpack.essl : clinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Zlinpack ####################################################
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
@@ -356,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
zlinpack.veclib : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+zlinpack.essl : zlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Scholesky ###################################################
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
@@ -441,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX)
sgemm.veclib : sgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+sgemm.essl : sgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Dgemm ####################################################
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -457,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX)
dgemm.veclib : dgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dgemm.essl : dgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Cgemm ####################################################
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
@@ -474,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX)
cgemm.veclib : cgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+cgemm.essl : cgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Zgemm ####################################################
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
@@ -491,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX)
zgemm.veclib : zgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+zgemm.essl : zgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ssymm ####################################################
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -573,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX)
strmm.veclib : strmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+strmm.essl : strmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Dtrmm ####################################################
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -589,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX)
dtrmm.veclib : dtrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dtrmm.essl : dtrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ctrmm ####################################################
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
@@ -606,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX)
ctrmm.veclib : ctrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ctrmm.essl : ctrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ztrmm ####################################################
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
@@ -623,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX)
ztrmm.veclib : ztrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ztrmm.essl : ztrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Strsm ####################################################
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -1413,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX)
zdot.veclib : zdot-intel.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Srot ####################################################
+srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+srot.acml : srot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+srot.atlas : srot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+srot.mkl : srot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+srot.veclib : srot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Drot ####################################################
+drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+drot.acml : drot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+drot.atlas : drot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+drot.mkl : drot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+drot.veclib : drot.$(SUFFIX)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+
##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -2124,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c
zgesv.$(SUFFIX) : gesv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+srot.$(SUFFIX) : rot.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+drot.$(SUFFIX) : rot.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+
@@ -2137,7 +2221,7 @@ smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
clean ::
- @rm -f *.goto *.mkl *.acml *.atlas *.veclib
+ @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
include $(TOPDIR)/Makefile.tail
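For reference, a minimal sketch of how the new benchmark targets are exercised (loop count and size arguments are arbitrary; the essl targets assume the ESSL/XL library paths hard-coded in LIBESSL above):

    cd benchmark

    # New rot benchmarks, linked against the OpenBLAS built in the parent directory:
    make srot.goto drot.goto
    OPENBLAS_LOOPS=10 ./srot.goto 100 2000 100    # from / to / step, as parsed in rot.c

    # New ESSL comparison targets (POWER, requires IBM ESSL and the XL runtime):
    make essl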
diff --git a/benchmark/rot.c b/benchmark/rot.c
new file mode 100644
index 000000000..32322bebb
--- /dev/null
+++ b/benchmark/rot.c
@@ -0,0 +1,197 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef DOT
+
+
+#ifdef DOUBLE
+#define ROT BLASFUNC(drot)
+#else
+#define ROT BLASFUNC(srot)
+#endif
+
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int main(int argc, char *argv[]){
+
+ FLOAT *x, *y;
+ // FLOAT result;
+ blasint m, i;
+ blasint inc_x=1,inc_y=1;
+ FLOAT c[1] = { 2.0 };
+ FLOAT s[1] = { 2.0 };
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+ if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
+
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+
+ for (l=0; l<loops; l++)
+ {
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ ROT (&m, x, &inc_x, y, &inc_y, c, s);
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ timeg += time1;
+
+ }
+
+ timeg /= loops;
+
+ fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
+
+ }
+
+ return 0;
+}
diff --git a/getarch.c b/getarch.c
--- a/getarch.c
+++ b/getarch.c
-#ifdef MAKE_NB_JOBS
+#if MAKE_NB_JOBS > 0
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
+ #else
+ // Let make use the parent -j argument, or -j1 if there
+ // is no parent make
+ #endif
#elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
diff --git a/getarch_2nd.c b/getarch_2nd.c
index fad647fed..cf9c578cb 100644
--- a/getarch_2nd.c
+++ b/getarch_2nd.c
@@ -64,10 +64,13 @@ int main(int argc, char **argv) {
if ((argc >= 2) && (*argv[1] == '1')) {
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
+#endif
#ifdef USE64BITINT
printf("#define USE64BITINT\n");
diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S
index 7a70264ca..7f2ddea07 100644
--- a/kernel/arm64/cgemm_kernel_4x4.S
+++ b/kernel/arm64/cgemm_kernel_4x4.S
@@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- fmul v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.4s, v2.4s, v9.4s[0]
+ fmls v19.4s, v2.4s, v9.s[0]
#else
- fmul v19.4s, v2.4s, v9.4s[0]
+ fmul v19.4s, v2.4s, v9.s[0]
#endif
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- fmul v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.4s, v2.4s, v9.4s[1]
+ fmls v23.4s, v2.4s, v9.s[1]
#else
- fmul v23.4s, v2.4s, v9.4s[1]
+ fmul v23.4s, v2.4s, v9.s[1]
#endif
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
+ fmul v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.4s[2]
+ fmls v27.4s, v2.4s, v9.s[2]
#else
- fmul v27.4s, v2.4s, v9.4s[2]
+ fmul v27.4s, v2.4s, v9.s[2]
#endif
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- fmul v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
+ fmul v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.4s[3]
+ fmls v31.4s, v2.4s, v9.s[3]
#else
- fmul v31.4s, v2.4s, v9.4s[3]
+ fmul v31.4s, v2.4s, v9.s[3]
#endif
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
@@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // for next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
ld2 {v4.4s, v5.4s} , [pA] // for next round
add pA, pA, #32
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
ld2 {v6.4s, v7.4s} , [ppA] // for next round
add ppA, ppA, #32
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
prfm PLDL1KEEP, [ppA, #512]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
.macro KERNEL8x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // for next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
ld2 {v0.4s, v1.4s}, [pA] // for next round
add pA, pA, #32
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
prfm PLDL1KEEP, [ppA, #512]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
ld2 {v2.4s, v3.4s}, [ppA] // for next round
add ppA, ppA, #32
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
.macro KERNEL8x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
.macro KERNEL8x4_SUB
@@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
.macro SAVE8x4
@@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE4x4
@@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
.macro SAVE2x4
@@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
.macro SAVE1x4
@@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE4x2
@@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
.macro SAVE1x2
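The hunks in this file and in cgemm_kernel_8x4.S below are the same mechanical substitution: the by-element operand is written v8.s[0] rather than the non-standard v8.4s[0] / v8.2s[0], matching the lane syntax given in the ARMv8-A reference and accepted by stricter assemblers. A stand-alone illustration (registers arbitrary):

    // By-element FMLA: the lane register carries only ".s[index]", no vector count.
    fmla    v16.4s, v0.4s, v8.s[0]      // portable spelling, as used after this patch
    // fmla v16.4s, v0.4s, v8.4s[0]     // older spelling removed by this patch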
diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S
old mode 100755
new mode 100644
index 40b98cee2..d58cef52d
--- a/kernel/arm64/cgemm_kernel_8x4.S
+++ b/kernel/arm64/cgemm_kernel_8x4.S
@@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.4s, v2.4s, v9.4s[0]
+ fmls v19.4s, v2.4s, v9.s[0]
#else
- fmul v19.4s, v2.4s, v9.4s[0]
+ fmul v19.4s, v2.4s, v9.s[0]
#endif
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.4s, v2.4s, v9.4s[1]
+ fmls v23.4s, v2.4s, v9.s[1]
#else
- fmul v23.4s, v2.4s, v9.4s[1]
+ fmul v23.4s, v2.4s, v9.s[1]
#endif
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
+ fmul v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.4s[2]
+ fmls v27.4s, v2.4s, v9.s[2]
#else
- fmul v27.4s, v2.4s, v9.4s[2]
+ fmul v27.4s, v2.4s, v9.s[2]
#endif
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- fmul v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
+ fmul v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.4s[3]
+ fmls v31.4s, v2.4s, v9.s[3]
#else
- fmul v31.4s, v2.4s, v9.4s[3]
+ fmul v31.4s, v2.4s, v9.s[3]
#endif
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
@@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
@@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32
@@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
@@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
@@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
@@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_SUB
@@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE4x4
@@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
.macro SAVE2x4
@@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
.macro SAVE1x4
@@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.2s[0]
- OP_ii v18.4s, v3.4s, v9.2s[0]
- OP_ri v19.4s, v2.4s, v9.2s[0]
- OP_ir v19.4s, v3.4s, v8.2s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v22.4s, v2.4s, v8.2s[1]
- OP_ii v22.4s, v3.4s, v9.2s[1]
- OP_ri v23.4s, v2.4s, v9.2s[1]
- OP_ir v23.4s, v3.4s, v8.2s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE8x2
@@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE4x2
@@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
.macro SAVE1x2
@@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v8.4s[1]
- OP_ri v17.4s, v0.4s, v8.4s[1]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v8.s[1]
+ OP_ri v17.4s, v0.4s, v8.s[1]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v8.4s[1]
- OP_ri v19.4s, v2.4s, v8.4s[1]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v8.s[1]
+ OP_ri v19.4s, v2.4s, v8.s[1]
+ OP_ir v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE8x1
diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S
index be0e9bdef..3de27257a 100644
--- a/kernel/arm64/ctrmm_kernel_4x4.S
+++ b/kernel/arm64/ctrmm_kernel_4x4.S
@@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
@@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_SUB
@@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE4x4
@@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
.macro SAVE2x4
@@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
.macro SAVE1x4
@@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE4x2
@@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
.macro SAVE1x2
diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
old mode 100755
new mode 100644
index 3131541d4..ce5cb0406
--- a/kernel/arm64/ctrmm_kernel_8x4.S
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.4s, v2.4s, v9.4s[0]
+ fmls v19.4s, v2.4s, v9.s[0]
#else
- fmul v19.4s, v2.4s, v9.4s[0]
+ fmul v19.4s, v2.4s, v9.s[0]
#endif
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.4s, v2.4s, v9.4s[1]
+ fmls v23.4s, v2.4s, v9.s[1]
#else
- fmul v23.4s, v2.4s, v9.4s[1]
+ fmul v23.4s, v2.4s, v9.s[1]
#endif
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
+ fmul v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.4s[2]
+ fmls v27.4s, v2.4s, v9.s[2]
#else
- fmul v27.4s, v2.4s, v9.4s[2]
+ fmul v27.4s, v2.4s, v9.s[2]
#endif
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- fmul v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
+ fmul v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.4s[3]
+ fmls v31.4s, v2.4s, v9.s[3]
#else
- fmul v31.4s, v2.4s, v9.4s[3]
+ fmul v31.4s, v2.4s, v9.s[3]
#endif
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
@@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
@@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32
@@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
@@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
@@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
@@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_SUB
@@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE4x4
@@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
.macro SAVE2x4
@@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
.macro SAVE1x4
@@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.2s[0]
- OP_ii v18.4s, v3.4s, v9.2s[0]
- OP_ri v19.4s, v2.4s, v9.2s[0]
- OP_ir v19.4s, v3.4s, v8.2s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- OP_rr v22.4s, v2.4s, v8.2s[1]
- OP_ii v22.4s, v3.4s, v9.2s[1]
- OP_ri v23.4s, v2.4s, v9.2s[1]
- OP_ir v23.4s, v3.4s, v8.2s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE8x2
@@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE4x2
@@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
.macro SAVE1x2
@@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v8.4s[1]
- OP_ri v17.4s, v0.4s, v8.4s[1]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v8.s[1]
+ OP_ri v17.4s, v0.4s, v8.s[1]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v8.4s[1]
- OP_ri v19.4s, v2.4s, v8.4s[1]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v8.s[1]
+ OP_ri v19.4s, v2.4s, v8.s[1]
+ OP_ir v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE8x1
diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S
index e2ad11492..44b0f7ff2 100644
--- a/kernel/arm64/dgemm_kernel_4x4.S
+++ b/kernel/arm64/dgemm_kernel_4x4.S
@@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldp q0, q1, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
+ fmul v20.2d, v0.2d, v9.d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmul v18.2d, v2.2d, v8.2d[0]
- fmul v31.2d, v3.2d, v11.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ fmul v31.2d, v3.2d, v11.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
- fmul v22.2d, v2.2d, v9.2d[0]
- fmul v27.2d, v3.2d, v10.2d[0]
+ fmul v22.2d, v2.2d, v9.d[0]
+ fmul v27.2d, v3.2d, v10.d[0]
ldp d12, d13, [pB]
add pB, pB, #16
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA] // for next round
add pA, pA, #32
- fmul v26.2d, v2.2d, v10.2d[0]
- fmul v23.2d, v3.2d, v9.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ fmul v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [ppA] // for next round
add ppA, ppA, #32
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v11.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
ldp d14, d15, [pB]
add pB, pB, #16
- fmul v30.2d, v2.2d, v11.2d[0]
- fmul v19.2d, v3.2d, v8.2d[0]
+ fmul v30.2d, v2.2d, v11.d[0]
+ fmul v19.2d, v3.2d, v8.d[0]
.endm
.macro KERNEL8x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
ldp d8, d9, [pB]
add pB, pB, #16
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
ldp d10, d11, [pB]
add pB, pB, #16
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
ldp q0, q1, [pA]
add pA, pA, #32
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v6.2d, v15.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
.endm
.macro KERNEL8x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB]
add pB, pB, #16
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
ldp d14, d15, [pB]
add pB, pB, #16
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v22.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA]
add pA, pA, #32
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
ldp q6, q7, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro KERNEL8x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v30.2d, v6.2d, v15.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
.endm
.macro KERNEL8x4_SUB
@@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldp q0, q1, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
- fmla v22.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x4
@@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
@@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
@@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
@@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
@@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S
old mode 100755
new mode 100644
index 88e9a773d..b04dbb5d5
--- a/kernel/arm64/dgemm_kernel_4x8.S
+++ b/kernel/arm64/dgemm_kernel_4x8.S
@@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v18.2d, v0.2d, v8.2d[1]
- fmul v19.2d, v1.2d, v8.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v18.2d, v0.2d, v8.d[1]
+ fmul v19.2d, v1.2d, v8.d[1]
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
- fmul v22.2d, v0.2d, v9.2d[1]
- fmul v23.2d, v1.2d, v9.2d[1]
+ fmul v20.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
+ fmul v22.2d, v0.2d, v9.d[1]
+ fmul v23.2d, v1.2d, v9.d[1]
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
- fmul v26.2d, v0.2d, v10.2d[1]
- fmul v27.2d, v1.2d, v10.2d[1]
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v26.2d, v0.2d, v10.d[1]
+ fmul v27.2d, v1.2d, v10.d[1]
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
- fmul v30.2d, v0.2d, v11.2d[1]
- fmul v31.2d, v1.2d, v11.2d[1]
+ fmul v28.2d, v0.2d, v11.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
+ fmul v30.2d, v0.2d, v11.d[1]
+ fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
@@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
@@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
.endm
.macro KERNEL4x8_SUB
@@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
.endm
.macro SAVE4x8
@@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
.endm
.macro SAVE2x8
@@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v9.d[1]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v25.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v8.d[1]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_SUB
@@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
@@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
@@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
@@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
@@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S
old mode 100755
new mode 100644
index a607fecc4..f3c3d5c35
--- a/kernel/arm64/dgemm_kernel_8x4.S
+++ b/kernel/arm64/dgemm_kernel_8x4.S
@@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 448
+#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
-// 15 pA
-// 16
+// 15 pCRow3
+// 16 pA
// 17
// 18 must save
// 19 must save
@@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
-//v08 must save pB0_0, pB0_1
-//v09 must save pB0_2, pB0_3
-//v10 must save ALPHA0
-//v11 must save ALPHA1
-//v12 must save pB1_0, pB1_1
-//v13 must save pB1_2, pB1_3
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v08 must save pB0_0
+//v09 must save pB0_1
+//v10 must save pB0_2 --> ALPHA0
+//v11 must save pB0_3
+//v12 must save pB1_0
+//v13 must save pB1_1
+//v14 must save pB1_2
+//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
@@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld1 {v0.2d, v1.2d}, [pA]
- add pA, pA, #32
- ld1 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
- ldp d8, d9, [pB]
- add pB, pB, #16
- ldp d10, d11, [pB]
- add pB, pB, #16
+ ldp q0, q1, [pA], #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
+ ldp d8, d9, [pB], #16
- fmul v18.2d, v2.2d, v8.2d[0]
- fmul v19.2d, v3.2d, v8.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v20.2d, v0.2d, v9.d[0]
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
+ ldp d10, d11, [pB], #16
- fmul v22.2d, v2.2d, v9.2d[0]
- fmul v23.2d, v3.2d, v9.2d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
+ ldp q2, q3, [pA], #32
- fmul v26.2d, v2.2d, v10.2d[0]
- fmul v27.2d, v3.2d, v10.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v28.2d, v0.2d, v11.d[0]
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
+ ldp q4, q5, [pA], #32
- fmul v30.2d, v2.2d, v11.2d[0]
- fmul v31.2d, v3.2d, v11.2d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
- ld1 {v4.2d, v5.2d}, [pA]
- add pA, pA, #32
- ld1 {v6.2d, v7.2d}, [pA]
- add pA, pA, #32
- ldp d12, d13, [pB]
- add pB, pB, #16
- ldp d14, d15, [pB]
- add pB, pB, #16
+ ldp d12, d13, [pB], #16
+
+ fmul v18.2d, v2.2d, v8.d[0]
+ fmul v22.2d, v2.2d, v9.d[0]
+
+ ldp d14, d15, [pB], #16
+
+ fmul v26.2d, v2.2d, v10.d[0]
+ fmul v30.2d, v2.2d, v11.d[0]
+
+ ldp q6, q7, [pA], #32
+
+ fmul v19.2d, v3.2d, v8.d[0]
+ fmul v27.2d, v3.2d, v10.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmul v31.2d, v3.2d, v11.d[0]
+ fmul v23.2d, v3.2d, v9.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL8x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
- ld1 {v4.2d}, [pA], #16
+ ldp q4, q5, [pA], #32
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
- ld1 {v5.2d}, [pA], #16
+ ldp d12, d13, [pB], #16
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
- ldp d12, d13, [pB]
- add pB, pB, #16
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
- ldp d14, d15, [pB]
- add pB, pB, #16
+ ldp d14, d15, [pB], #16
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
- ld1 {v6.2d}, [pA], #16
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
- ld1 {v7.2d}, [pA], #16
+ ldp q6, q7, [pA], #32
- fmla v22.2d, v2.2d, v9.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
-
- prfm PLDL1KEEP, [pA, #224]
- prfm PLDL1KEEP, [pA, #224+64]
+ fmla v27.2d, v3.2d, v10.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
- ld1 {v0.2d}, [pA], #16
+ ldp q0, q1, [pA], #32
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
- ld1 {v1.2d}, [pA], #16
+ ldp d8, d9, [pB], #16
- fmla v30.2d, v6.2d, v15.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
- ldp d8, d9, [pB]
- add pB, pB, #16
+ ldp d10, d11, [pB], #16
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
- ldp d10, d11, [pB]
- add pB, pB, #16
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
- ld1 {v2.2d}, [pA], #16
+ fmla v19.2d, v7.2d, v12.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
+ ldp q2, q3, [pA], #32
- ld1 {v3.2d}, [pA], #16
-
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
-
- prfm PLDL1KEEP, [pB, #640]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v6.2d, v15.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
+
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
+
+ fmla v19.2d, v7.2d, v12.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
- ld1 {v0.2d, v1.2d}, [pA]
- add pA, pA, #32
- ld1 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
- ldp d8, d9, [pB]
- add pB, pB, #16
- ldp d10, d11, [pB]
- add pB, pB, #16
+ ldp q0, q1, [pA], #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ ldp d8, d9, [pB], #16
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v2.2d, v9.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ ldp d10, d11, [pB], #16
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+
+ ldp q2, q3, [pA], #32
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v19.2d, v3.2d, v8.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
+
+ fmla v31.2d, v3.2d, v11.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
fmov alpha0, alpha
- ld1 {v0.2d, v1.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+ ldp q0, q1, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0
- st1 {v0.2d, v1.2d}, [pCRow0]
+ stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
- ld1 {v2.2d, v3.2d}, [pCRow0]
+ ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0
- st1 {v2.2d, v3.2d}, [pCRow0]
+ stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
- ld1 {v4.2d, v5.2d}, [pCRow1]
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+ ldp q4, q5, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0
- st1 {v4.2d, v5.2d}, [pCRow1]
+ stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
- ld1 {v6.2d, v7.2d}, [pCRow1]
+ ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV0
- st1 {v6.2d, v7.2d}, [pCRow1]
+ stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
- ld1 {v0.2d, v1.2d}, [pCRow2]
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+ ldp q0, q1, [pCRow2]
fmla v0.2d, v24.2d, alphaV0
fmla v1.2d, v25.2d, alphaV0
- st1 {v0.2d, v1.2d}, [pCRow2]
+ stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
- ld1 {v2.2d, v3.2d}, [pCRow2]
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+ ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0
fmla v3.2d, v27.2d, alphaV0
- st1 {v2.2d, v3.2d}, [pCRow2]
+ stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
- ld1 {v4.2d, v5.2d}, [pCRow3]
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+ ldp q4, q5, [pCRow3]
fmla v4.2d, v28.2d, alphaV0
fmla v5.2d, v29.2d, alphaV0
- st1 {v4.2d, v5.2d}, [pCRow3]
+ stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
- ld1 {v6.2d, v7.2d}, [pCRow3]
+ ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0
fmla v7.2d, v31.2d, alphaV0
- st1 {v6.2d, v7.2d}, [pCRow3]
+ stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
-
- prfm PLDL2KEEP, [pCRow0, #128]
- prfm PLDL2KEEP, [pCRow1, #128]
- prfm PLDL2KEEP, [pCRow2, #128]
- prfm PLDL2KEEP, [pCRow3, #128]
.endm
/******************************************************************************/
@@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
+ fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV2
- fmla v13.2d, v21.2d, alphaV3
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
- fmla v9.2d, v25.2d, alphaV1
+ fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v28.2d, alphaV2
- fmla v13.2d, v29.2d, alphaV3
+ fmla v12.2d, v28.2d, alphaV0
+ fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
+ fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1, pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d}, [pCRow2]
- fmla v8.2d, v24.2d, alphaV2
+ fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v28.2d, alphaV3
+ fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
+ fmov alpha0, alpha
add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE8x2
+ fmov alpha0, alpha
add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
- fmla v1.2d, v17.2d, alphaV1
- fmla v2.2d, v18.2d, alphaV2
- fmla v3.2d, v19.2d, alphaV3
+ fmla v1.2d, v17.2d, alphaV0
+ fmla v2.2d, v18.2d, alphaV0
+ fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
- fmla v5.2d, v21.2d, alphaV1
- fmla v6.2d, v22.2d, alphaV2
- fmla v7.2d, v23.2d, alphaV3
+ fmla v5.2d, v21.2d, alphaV0
+ fmla v6.2d, v22.2d, alphaV0
+ fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
+ fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV2
- fmla v13.2d, v21.2d, alphaV3
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
+ fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1 , pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
+ fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
+ fmov alpha0, alpha
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
- fmla v1.2d, v17.2d, alphaV1
- fmla v2.2d, v18.2d, alphaV2
- fmla v3.2d, v19.2d, alphaV3
+ fmla v1.2d, v17.2d, alphaV0
+ fmla v2.2d, v18.2d, alphaV0
+ fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
+ fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
+ fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0, alpha
ldr d8, [pCRow0]
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
@@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
+
add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
@@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
+ .align 5
dgemm_kernel_L4_M8_20:
mov pB, origPB
@@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
- .align 5
+ .align 5
dgemm_kernel_L4_M8_22:
KERNEL8x4_M1
@@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
-
+ .align 5
dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
@@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
b dgemm_kernel_L4_M8_44
+ .align 5
dgemm_kernel_L4_M8_32:
tst counterL, #1
@@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
+ .align 5
dgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
@@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
bne dgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
SAVE8x4
diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S
index 0d1b12881..34fb8c233 100644
--- a/kernel/arm64/dtrmm_kernel_4x4.S
+++ b/kernel/arm64/dtrmm_kernel_4x4.S
@@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v9.d[1]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v25.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v8.d[1]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_SUB
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
@@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
@@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
@@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
@@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S
old mode 100755
new mode 100644
index eb7397faa..4aecf28eb
--- a/kernel/arm64/dtrmm_kernel_4x8.S
+++ b/kernel/arm64/dtrmm_kernel_4x8.S
@@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v18.2d, v0.2d, v8.2d[1]
- fmul v19.2d, v1.2d, v8.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v18.2d, v0.2d, v8.d[1]
+ fmul v19.2d, v1.2d, v8.d[1]
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
- fmul v22.2d, v0.2d, v9.2d[1]
- fmul v23.2d, v1.2d, v9.2d[1]
+ fmul v20.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
+ fmul v22.2d, v0.2d, v9.d[1]
+ fmul v23.2d, v1.2d, v9.d[1]
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
- fmul v26.2d, v0.2d, v10.2d[1]
- fmul v27.2d, v1.2d, v10.2d[1]
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v26.2d, v0.2d, v10.d[1]
+ fmul v27.2d, v1.2d, v10.d[1]
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
- fmul v30.2d, v0.2d, v11.2d[1]
- fmul v31.2d, v1.2d, v11.2d[1]
+ fmul v28.2d, v0.2d, v11.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
+ fmul v30.2d, v0.2d, v11.d[1]
+ fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
@@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
@@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
.endm
.macro KERNEL4x8_SUB
@@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
.endm
.macro SAVE4x8
@@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
.endm
.macro SAVE2x8
@@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v9.d[1]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v25.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v8.d[1]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_SUB
@@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
@@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
@@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
@@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
@@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S
old mode 100755
new mode 100644
index 6890505bd..b06c7560d
--- a/kernel/arm64/dtrmm_kernel_8x4.S
+++ b/kernel/arm64/dtrmm_kernel_8x4.S
@@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v18.2d, v2.2d, v8.2d[0]
- fmul v19.2d, v3.2d, v8.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ fmul v19.2d, v3.2d, v8.d[0]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v21.2d, v1.2d, v8.2d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- fmul v23.2d, v3.2d, v8.2d[1]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v21.2d, v1.2d, v8.d[1]
+ fmul v22.2d, v2.2d, v8.d[1]
+ fmul v23.2d, v3.2d, v8.d[1]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v25.2d, v1.2d, v9.2d[0]
- fmul v26.2d, v2.2d, v9.2d[0]
- fmul v27.2d, v3.2d, v9.2d[0]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v25.2d, v1.2d, v9.d[0]
+ fmul v26.2d, v2.2d, v9.d[0]
+ fmul v27.2d, v3.2d, v9.d[0]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v29.2d, v1.2d, v9.2d[1]
- fmul v30.2d, v2.2d, v9.2d[1]
- fmul v31.2d, v3.2d, v9.2d[1]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v29.2d, v1.2d, v9.d[1]
+ fmul v30.2d, v2.2d, v9.d[1]
+ fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
@@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v9.2d[0]
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v9.2d[0]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v9.d[0]
+ fmla v26.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v9.d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v29.2d, v1.2d, v9.2d[1]
- fmla v30.2d, v2.2d, v9.2d[1]
- fmla v31.2d, v3.2d, v9.2d[1]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v29.2d, v1.2d, v9.d[1]
+ fmla v30.2d, v2.2d, v9.d[1]
+ fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
@@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v21.2d, v5.2d, v12.2d[1]
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v23.2d, v7.2d, v12.2d[1]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v21.2d, v5.2d, v12.d[1]
+ fmla v22.2d, v6.2d, v12.d[1]
+ fmla v23.2d, v7.2d, v12.d[1]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v25.2d, v5.2d, v13.2d[0]
- fmla v26.2d, v6.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v13.2d[0]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v25.2d, v5.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v13.d[0]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v29.2d, v5.2d, v13.2d[1]
- fmla v30.2d, v6.2d, v13.2d[1]
- fmla v31.2d, v7.2d, v13.2d[1]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v29.2d, v5.2d, v13.d[1]
+ fmla v30.2d, v6.2d, v13.d[1]
+ fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
@@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v21.2d, v5.2d, v12.2d[1]
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v23.2d, v7.2d, v12.2d[1]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v21.2d, v5.2d, v12.d[1]
+ fmla v22.2d, v6.2d, v12.d[1]
+ fmla v23.2d, v7.2d, v12.d[1]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v25.2d, v5.2d, v13.2d[0]
- fmla v26.2d, v6.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v13.2d[0]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v25.2d, v5.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v13.d[0]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v29.2d, v5.2d, v13.2d[1]
- fmla v30.2d, v6.2d, v13.2d[1]
- fmla v31.2d, v7.2d, v13.2d[1]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v29.2d, v5.2d, v13.d[1]
+ fmla v30.2d, v6.2d, v13.d[1]
+ fmla v31.2d, v7.2d, v13.d[1]
.endm
.macro KERNEL8x4_SUB
@@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v9.2d[0]
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v9.2d[0]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v9.d[0]
+ fmla v26.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v9.d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v29.2d, v1.2d, v9.2d[1]
- fmla v30.2d, v2.2d, v9.2d[1]
- fmla v31.2d, v3.2d, v9.2d[1]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v29.2d, v1.2d, v9.d[1]
+ fmla v30.2d, v2.2d, v9.d[1]
+ fmla v31.2d, v3.2d, v9.d[1]
.endm
.macro SAVE8x4
@@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
@@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
@@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE8x2
@@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
@@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
@@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
@@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S
index 22b55b01c..68366d9f2 100644
--- a/kernel/arm64/sgemm_kernel_16x4.S
+++ b/kernel/arm64/sgemm_kernel_16x4.S
@@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v18.4s, v2.4s, v8.2s[0]
- fmul v19.4s, v3.4s, v8.2s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ fmul v19.4s, v3.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v22.4s, v2.4s, v8.2s[1]
- fmul v23.4s, v3.4s, v8.2s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ fmul v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v26.4s, v2.4s, v9.2s[0]
- fmul v27.4s, v3.4s, v9.2s[0]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v26.4s, v2.4s, v9.s[0]
+ fmul v27.4s, v3.4s, v9.s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
- fmul v30.4s, v2.4s, v9.2s[1]
- fmul v31.4s, v3.4s, v9.2s[1]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
+ fmul v30.4s, v2.4s, v9.s[1]
+ fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
@@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
.endm
.macro KERNEL16x4_SUB
@@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
.endm
.macro SAVE16x4
@@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
@@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
@@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
@@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
@@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
@@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
@@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE16x2
@@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
@@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
@@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
@@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE16x1
@@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
@@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
@@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S
index bfa80d589..a5cf7baff 100644
--- a/kernel/arm64/sgemm_kernel_4x4.S
+++ b/kernel/arm64/sgemm_kernel_4x4.S
@@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16
- fmul v16.4s, v0.4s, v8.4s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16
- fmul v24.4s, v0.4s, v8.4s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
+ fmul v24.4s, v0.4s, v8.s[2]
+ fmul v28.4s, v0.4s, v8.s[3]
ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16
- fmul v17.4s, v2.4s, v8.4s[0]
- fmul v21.4s, v2.4s, v8.4s[1]
+ fmul v17.4s, v2.4s, v8.s[0]
+ fmul v21.4s, v2.4s, v8.s[1]
ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16
- fmul v25.4s, v2.4s, v8.4s[2]
- fmul v29.4s, v2.4s, v8.4s[3]
+ fmul v25.4s, v2.4s, v8.s[2]
+ fmul v29.4s, v2.4s, v8.s[3]
ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16
- fmul v18.4s, v4.4s, v8.4s[0]
- fmul v19.4s, v6.4s, v8.4s[0]
+ fmul v18.4s, v4.4s, v8.s[0]
+ fmul v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16
- fmul v22.4s, v4.4s, v8.4s[1]
- fmul v23.4s, v6.4s, v8.4s[1]
+ fmul v22.4s, v4.4s, v8.s[1]
+ fmul v23.4s, v6.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16
- fmul v26.4s, v4.4s, v8.4s[2]
- fmul v27.4s, v6.4s, v8.4s[2]
+ fmul v26.4s, v4.4s, v8.s[2]
+ fmul v27.4s, v6.4s, v8.s[2]
ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16
- fmul v30.4s, v4.4s, v8.4s[3]
- fmul v31.4s, v6.4s, v8.4s[3]
+ fmul v30.4s, v4.4s, v8.s[3]
+ fmul v31.4s, v6.4s, v8.s[3]
ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16
.endm
.macro KERNEL16x4_M2
- fmla v16.4s, v1.4s, v12.4s[0]
- fmla v17.4s, v3.4s, v12.4s[0]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v17.4s, v3.4s, v12.s[0]
ld1 {v8.4s}, [pB] // for next round
add pB, pB, #16
- fmla v18.4s, v5.4s, v12.4s[0]
- fmla v19.4s, v7.4s, v12.4s[0]
+ fmla v18.4s, v5.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
ld1 {v0.4s}, [pA_0] // for next round
add pA_0, pA_0, #16
- fmla v20.4s, v1.4s, v12.4s[1]
- fmla v21.4s, v3.4s, v12.4s[1]
+ fmla v20.4s, v1.4s, v12.s[1]
+ fmla v21.4s, v3.4s, v12.s[1]
ld1 {v2.4s}, [pA_1] // for next round
add pA_1, pA_1, #16
- fmla v22.4s, v5.4s, v12.4s[1]
- fmla v23.4s, v7.4s, v12.4s[1]
+ fmla v22.4s, v5.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
ld1 {v4.4s}, [pA_2] // for next round
add pA_2, pA_2, #16
- fmla v24.4s, v1.4s, v12.4s[2]
- fmla v25.4s, v3.4s, v12.4s[2]
+ fmla v24.4s, v1.4s, v12.s[2]
+ fmla v25.4s, v3.4s, v12.s[2]
ld1 {v6.4s}, [pA_3] // for next round
add pA_3, pA_3, #16
- fmla v26.4s, v5.4s, v12.4s[2]
- fmla v27.4s, v7.4s, v12.4s[2]
+ fmla v26.4s, v5.4s, v12.s[2]
+ fmla v27.4s, v7.4s, v12.s[2]
prfm PLDL1KEEP, [pA_2, #512]
- fmla v28.4s, v1.4s, v12.4s[3]
- fmla v29.4s, v3.4s, v12.4s[3]
+ fmla v28.4s, v1.4s, v12.s[3]
+ fmla v29.4s, v3.4s, v12.s[3]
prfm PLDL1KEEP, [pA_3, #512]
- fmla v30.4s, v5.4s, v12.4s[3]
- fmla v31.4s, v7.4s, v12.4s[3]
+ fmla v30.4s, v5.4s, v12.s[3]
+ fmla v31.4s, v7.4s, v12.s[3]
prfm PLDL1KEEP, [pB, #512]
.endm
.macro KERNEL16x4_M1
- fmla v16.4s, v0.4s, v8.4s[0]
- fmla v17.4s, v2.4s, v8.4s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v2.4s, v8.s[0]
ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16
- fmla v18.4s, v4.4s, v8.4s[0]
- fmla v19.4s, v6.4s, v8.4s[0]
+ fmla v18.4s, v4.4s, v8.s[0]
+ fmla v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16
- fmla v20.4s, v0.4s, v8.4s[1]
- fmla v21.4s, v2.4s, v8.4s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v2.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16
- fmla v22.4s, v4.4s, v8.4s[1]
- fmla v23.4s, v6.4s, v8.4s[1]
+ fmla v22.4s, v4.4s, v8.s[1]
+ fmla v23.4s, v6.4s, v8.s[1]
ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16
- fmla v24.4s, v0.4s, v8.4s[2]
- fmla v25.4s, v2.4s, v8.4s[2]
+ fmla v24.4s, v0.4s, v8.s[2]
+ fmla v25.4s, v2.4s, v8.s[2]
ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16
- fmla v26.4s, v4.4s, v8.4s[2]
- fmla v27.4s, v6.4s, v8.4s[2]
+ fmla v26.4s, v4.4s, v8.s[2]
+ fmla v27.4s, v6.4s, v8.s[2]
prfm PLDL1KEEP, [pA_0, #512]
- fmla v28.4s, v0.4s, v8.4s[3]
- fmla v29.4s, v2.4s, v8.4s[3]
+ fmla v28.4s, v0.4s, v8.s[3]
+ fmla v29.4s, v2.4s, v8.s[3]
prfm PLDL1KEEP, [pA_1, #512]
- fmla v30.4s, v4.4s, v8.4s[3]
- fmla v31.4s, v6.4s, v8.4s[3]
+ fmla v30.4s, v4.4s, v8.s[3]
+ fmla v31.4s, v6.4s, v8.s[3]
.endm
.macro KERNEL16x4_E
- fmla v16.4s, v1.4s, v12.4s[0]
- fmla v17.4s, v3.4s, v12.4s[0]
- fmla v18.4s, v5.4s, v12.4s[0]
- fmla v19.4s, v7.4s, v12.4s[0]
- fmla v20.4s, v1.4s, v12.4s[1]
- fmla v21.4s, v3.4s, v12.4s[1]
- fmla v22.4s, v5.4s, v12.4s[1]
- fmla v23.4s, v7.4s, v12.4s[1]
- fmla v24.4s, v1.4s, v12.4s[2]
- fmla v25.4s, v3.4s, v12.4s[2]
- fmla v26.4s, v5.4s, v12.4s[2]
- fmla v27.4s, v7.4s, v12.4s[2]
- fmla v28.4s, v1.4s, v12.4s[3]
- fmla v29.4s, v3.4s, v12.4s[3]
- fmla v30.4s, v5.4s, v12.4s[3]
- fmla v31.4s, v7.4s, v12.4s[3]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v17.4s, v3.4s, v12.s[0]
+ fmla v18.4s, v5.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
+ fmla v20.4s, v1.4s, v12.s[1]
+ fmla v21.4s, v3.4s, v12.s[1]
+ fmla v22.4s, v5.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
+ fmla v24.4s, v1.4s, v12.s[2]
+ fmla v25.4s, v3.4s, v12.s[2]
+ fmla v26.4s, v5.4s, v12.s[2]
+ fmla v27.4s, v7.4s, v12.s[2]
+ fmla v28.4s, v1.4s, v12.s[3]
+ fmla v29.4s, v3.4s, v12.s[3]
+ fmla v30.4s, v5.4s, v12.s[3]
+ fmla v31.4s, v7.4s, v12.s[3]
.endm
.macro KERNEL16x4_SUB
@@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.4s, v0.4s, v8.4s[0]
- fmla v20.4s, v0.4s, v8.4s[1]
- fmla v24.4s, v0.4s, v8.4s[2]
- fmla v28.4s, v0.4s, v8.4s[3]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v8.s[2]
+ fmla v28.4s, v0.4s, v8.s[3]
ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16
- fmla v17.4s, v2.4s, v8.4s[0]
- fmla v21.4s, v2.4s, v8.4s[1]
- fmla v25.4s, v2.4s, v8.4s[2]
- fmla v29.4s, v2.4s, v8.4s[3]
+ fmla v17.4s, v2.4s, v8.s[0]
+ fmla v21.4s, v2.4s, v8.s[1]
+ fmla v25.4s, v2.4s, v8.s[2]
+ fmla v29.4s, v2.4s, v8.s[3]
ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16
- fmla v18.4s, v4.4s, v8.4s[0]
- fmla v22.4s, v4.4s, v8.4s[1]
- fmla v26.4s, v4.4s, v8.4s[2]
- fmla v30.4s, v4.4s, v8.4s[3]
+ fmla v18.4s, v4.4s, v8.s[0]
+ fmla v22.4s, v4.4s, v8.s[1]
+ fmla v26.4s, v4.4s, v8.s[2]
+ fmla v30.4s, v4.4s, v8.s[3]
ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16
- fmla v19.4s, v6.4s, v8.4s[0]
- fmla v23.4s, v6.4s, v8.4s[1]
- fmla v27.4s, v6.4s, v8.4s[2]
- fmla v31.4s, v6.4s, v8.4s[3]
+ fmla v19.4s, v6.4s, v8.s[0]
+ fmla v23.4s, v6.4s, v8.s[1]
+ fmla v27.4s, v6.4s, v8.s[2]
+ fmla v31.4s, v6.4s, v8.s[3]
.endm
.macro SAVE16x4
@@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v2.2s, v3.2s}, [pA_1]
add pA_1, pA_1, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
- fmla v18.2s, v2.2s, v8.2s[0]
- fmla v31.2s, v3.2s, v9.2s[1]
- fmla v22.2s, v2.2s, v8.2s[1]
- fmla v27.2s, v3.2s, v9.2s[0]
+ fmla v18.2s, v2.2s, v8.s[0]
+ fmla v31.2s, v3.2s, v9.s[1]
+ fmla v22.2s, v2.2s, v8.s[1]
+ fmla v27.2s, v3.2s, v9.s[0]
- fmla v26.2s, v2.2s, v9.2s[0]
- fmla v23.2s, v3.2s, v8.2s[1]
- fmla v30.2s, v2.2s, v9.2s[1]
- fmla v19.2s, v3.2s, v8.2s[0]
+ fmla v26.2s, v2.2s, v9.s[0]
+ fmla v23.2s, v3.2s, v8.s[1]
+ fmla v30.2s, v2.2s, v9.s[1]
+ fmla v19.2s, v3.2s, v8.s[0]
.endm
.macro SAVE8x4
@@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
@@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
@@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
@@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA_0]
add pA_0, pA_0, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
@@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0 , pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
@@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0]
add pA_0 , pA_0, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S
index ac690e4d4..bd47bed31 100644
--- a/kernel/arm64/sgemm_kernel_8x8.S
+++ b/kernel/arm64/sgemm_kernel_8x8.S
@@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v17.4s, v1.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v19.4s, v1.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v21.4s, v1.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v23.4s, v1.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v25.4s, v1.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v27.4s, v1.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v29.4s, v1.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
- fmul v31.4s, v1.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v17.4s, v1.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v19.4s, v1.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v21.4s, v1.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v23.4s, v1.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v25.4s, v1.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v27.4s, v1.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v29.4s, v1.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
+ fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
@@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
.endm
.macro KERNEL8x8_SUB
@@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
.endm
.macro SAVE8x8
@@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
@@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_SUB
@@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
.endm
.macro SAVE4x8
@@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v4.4s[0]
- fmla v18.2s, v0.2s, v4.4s[1]
- fmla v20.2s, v0.2s, v4.4s[2]
- fmla v22.2s, v0.2s, v4.4s[3]
- fmla v24.2s, v0.2s, v5.4s[0]
- fmla v26.2s, v0.2s, v5.4s[1]
- fmla v28.2s, v0.2s, v5.4s[2]
- fmla v30.2s, v0.2s, v5.4s[3]
+ fmla v16.2s, v0.2s, v4.s[0]
+ fmla v18.2s, v0.2s, v4.s[1]
+ fmla v20.2s, v0.2s, v4.s[2]
+ fmla v22.2s, v0.2s, v4.s[3]
+ fmla v24.2s, v0.2s, v5.s[0]
+ fmla v26.2s, v0.2s, v5.s[1]
+ fmla v28.2s, v0.2s, v5.s[2]
+ fmla v30.2s, v0.2s, v5.s[3]
.endm
.macro SAVE2x8
@@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0, [pA]
add pA, pA, #4
- fmla s16, s0, v4.4s[0]
- fmla s18, s0, v4.4s[1]
- fmla s20, s0, v4.4s[2]
- fmla s22, s0, v4.4s[3]
- fmla s24, s0, v5.4s[0]
- fmla s26, s0, v5.4s[1]
- fmla s28, s0, v5.4s[2]
- fmla s30, s0, v5.4s[3]
+ fmla s16, s0, v4.s[0]
+ fmla s18, s0, v4.s[1]
+ fmla s20, s0, v4.s[2]
+ fmla s22, s0, v4.s[3]
+ fmla s24, s0, v5.s[0]
+ fmla s26, s0, v5.s[1]
+ fmla s28, s0, v5.s[2]
+ fmla s30, s0, v5.s[3]
.endm
.macro SAVE1x8
@@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
@@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
@@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
@@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
@@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
@@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
@@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
@@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
@@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
@@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
@@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
@@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S
old mode 100755
new mode 100644
index b99760a03..28b321651
--- a/kernel/arm64/strmm_kernel_16x4.S
+++ b/kernel/arm64/strmm_kernel_16x4.S
@@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v18.4s, v2.4s, v8.2s[0]
- fmul v19.4s, v3.4s, v8.2s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ fmul v19.4s, v3.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v22.4s, v2.4s, v8.2s[1]
- fmul v23.4s, v3.4s, v8.2s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ fmul v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v26.4s, v2.4s, v9.2s[0]
- fmul v27.4s, v3.4s, v9.2s[0]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v26.4s, v2.4s, v9.s[0]
+ fmul v27.4s, v3.4s, v9.s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
- fmul v30.4s, v2.4s, v9.2s[1]
- fmul v31.4s, v3.4s, v9.2s[1]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
+ fmul v30.4s, v2.4s, v9.s[1]
+ fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
@@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
.endm
.macro KERNEL16x4_SUB
@@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
.endm
.macro SAVE16x4
@@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
@@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
@@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
@@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
@@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
@@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
@@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE16x2
@@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
@@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
@@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
@@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE16x1
@@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
@@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
@@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S
index 674e200d8..eeb3e6e72 100644
--- a/kernel/arm64/strmm_kernel_4x4.S
+++ b/kernel/arm64/strmm_kernel_4x4.S
@@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
@@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
@@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
@@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
@@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
@@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S
old mode 100755
new mode 100644
index 98b912934..843f0c890
--- a/kernel/arm64/strmm_kernel_8x8.S
+++ b/kernel/arm64/strmm_kernel_8x8.S
@@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v17.4s, v1.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v19.4s, v1.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v21.4s, v1.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v23.4s, v1.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v25.4s, v1.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v27.4s, v1.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v29.4s, v1.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
- fmul v31.4s, v1.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v17.4s, v1.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v19.4s, v1.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v21.4s, v1.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v23.4s, v1.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v25.4s, v1.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v27.4s, v1.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v29.4s, v1.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
+ fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
@@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
.endm
.macro KERNEL8x8_SUB
@@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
.endm
.macro SAVE8x8
@@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
@@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
@@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_SUB
@@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
.endm
.macro SAVE4x8
@@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v4.4s[0]
- fmla v18.2s, v0.2s, v4.4s[1]
- fmla v20.2s, v0.2s, v4.4s[2]
- fmla v22.2s, v0.2s, v4.4s[3]
- fmla v24.2s, v0.2s, v5.4s[0]
- fmla v26.2s, v0.2s, v5.4s[1]
- fmla v28.2s, v0.2s, v5.4s[2]
- fmla v30.2s, v0.2s, v5.4s[3]
+ fmla v16.2s, v0.2s, v4.s[0]
+ fmla v18.2s, v0.2s, v4.s[1]
+ fmla v20.2s, v0.2s, v4.s[2]
+ fmla v22.2s, v0.2s, v4.s[3]
+ fmla v24.2s, v0.2s, v5.s[0]
+ fmla v26.2s, v0.2s, v5.s[1]
+ fmla v28.2s, v0.2s, v5.s[2]
+ fmla v30.2s, v0.2s, v5.s[3]
.endm
.macro SAVE2x8
@@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0, [pA]
add pA, pA, #4
- fmla s16, s0, v4.4s[0]
- fmla s18, s0, v4.4s[1]
- fmla s20, s0, v4.4s[2]
- fmla s22, s0, v4.4s[3]
- fmla s24, s0, v5.4s[0]
- fmla s26, s0, v5.4s[1]
- fmla s28, s0, v5.4s[2]
- fmla s30, s0, v5.4s[3]
+ fmla s16, s0, v4.s[0]
+ fmla s18, s0, v4.s[1]
+ fmla s20, s0, v4.s[2]
+ fmla s22, s0, v4.s[3]
+ fmla s24, s0, v5.s[0]
+ fmla s26, s0, v5.s[1]
+ fmla s28, s0, v5.s[2]
+ fmla s30, s0, v5.s[3]
.endm
.macro SAVE1x8
@@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
@@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
@@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
@@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
@@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
@@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
@@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
@@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
@@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
@@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
@@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
@@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
@@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
@@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S
index 28ce3de40..1cb695e56 100644
--- a/kernel/arm64/zgemm_kernel_4x4.S
+++ b/kernel/arm64/zgemm_kernel_4x4.S
@@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.2d, v0.2d, v9.2d[0]
+ fmls v17.2d, v0.2d, v9.d[0]
#else
- fmul v17.2d, v0.2d, v9.2d[0]
+ fmul v17.2d, v0.2d, v9.d[0]
#endif
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- fmul v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.2d, v2.2d, v9.2d[0]
+ fmls v19.2d, v2.2d, v9.d[0]
#else
- fmul v19.2d, v2.2d, v9.2d[0]
+ fmul v19.2d, v2.2d, v9.d[0]
#endif
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- fmul v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
+ fmul v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.2d, v0.2d, v9.2d[1]
+ fmls v21.2d, v0.2d, v9.d[1]
#else
- fmul v21.2d, v0.2d, v9.2d[1]
+ fmul v21.2d, v0.2d, v9.d[1]
#endif
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
+ fmul v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.2d, v2.2d, v9.2d[1]
+ fmls v23.2d, v2.2d, v9.d[1]
#else
- fmul v23.2d, v2.2d, v9.2d[1]
+ fmul v23.2d, v2.2d, v9.d[1]
#endif
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
- fmul v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.2d, v0.2d, v11.2d[0]
+ fmls v25.2d, v0.2d, v11.d[0]
#else
- fmul v25.2d, v0.2d, v11.2d[0]
+ fmul v25.2d, v0.2d, v11.d[0]
#endif
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- fmul v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.2d, v2.2d, v11.2d[0]
+ fmls v27.2d, v2.2d, v11.d[0]
#else
- fmul v27.2d, v2.2d, v11.2d[0]
+ fmul v27.2d, v2.2d, v11.d[0]
#endif
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
- fmul v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
+ fmul v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.2d, v0.2d, v11.2d[1]
+ fmls v29.2d, v0.2d, v11.d[1]
#else
- fmul v29.2d, v0.2d, v11.2d[1]
+ fmul v29.2d, v0.2d, v11.d[1]
#endif
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- fmul v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
+ fmul v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.2d, v2.2d, v11.2d[1]
+ fmls v31.2d, v2.2d, v11.d[1]
#else
- fmul v31.2d, v2.2d, v11.2d[1]
+ fmul v31.2d, v2.2d, v11.d[1]
#endif
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_E
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_SUB
@@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro SAVE4x4
@@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
.endm
.macro SAVE2x4
@@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
- OP_rr d24, d0, v10.2d[0]
- OP_ii d24, d1, v11.2d[0]
- OP_ri d25, d0, v11.2d[0]
- OP_ir d25, d1, v10.2d[0]
+ OP_rr d24, d0, v10.d[0]
+ OP_ii d24, d1, v11.d[0]
+ OP_ri d25, d0, v11.d[0]
+ OP_ir d25, d1, v10.d[0]
- OP_rr d28, d0, v10.2d[1]
- OP_ii d28, d1, v11.2d[1]
- OP_ri d29, d0, v11.2d[1]
- OP_ir d29, d1, v10.2d[1]
+ OP_rr d28, d0, v10.d[1]
+ OP_ii d28, d1, v11.d[1]
+ OP_ri d29, d0, v11.d[1]
+ OP_ir d29, d1, v10.d[1]
.endm
.macro SAVE1x4
@@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
.endm
.macro SAVE1x2
diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S
index 3ff8227e3..7945870d6 100644
--- a/kernel/arm64/ztrmm_kernel_4x4.S
+++ b/kernel/arm64/ztrmm_kernel_4x4.S
@@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.2d, v0.2d, v9.2d[0]
+ fmls v17.2d, v0.2d, v9.d[0]
#else
- fmul v17.2d, v0.2d, v9.2d[0]
+ fmul v17.2d, v0.2d, v9.d[0]
#endif
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- fmul v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.2d, v2.2d, v9.2d[0]
+ fmls v19.2d, v2.2d, v9.d[0]
#else
- fmul v19.2d, v2.2d, v9.2d[0]
+ fmul v19.2d, v2.2d, v9.d[0]
#endif
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- fmul v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
+ fmul v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.2d, v0.2d, v9.2d[1]
+ fmls v21.2d, v0.2d, v9.d[1]
#else
- fmul v21.2d, v0.2d, v9.2d[1]
+ fmul v21.2d, v0.2d, v9.d[1]
#endif
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
+ fmul v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.2d, v2.2d, v9.2d[1]
+ fmls v23.2d, v2.2d, v9.d[1]
#else
- fmul v23.2d, v2.2d, v9.2d[1]
+ fmul v23.2d, v2.2d, v9.d[1]
#endif
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
- fmul v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.2d, v0.2d, v11.2d[0]
+ fmls v25.2d, v0.2d, v11.d[0]
#else
- fmul v25.2d, v0.2d, v11.2d[0]
+ fmul v25.2d, v0.2d, v11.d[0]
#endif
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- fmul v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.2d, v2.2d, v11.2d[0]
+ fmls v27.2d, v2.2d, v11.d[0]
#else
- fmul v27.2d, v2.2d, v11.2d[0]
+ fmul v27.2d, v2.2d, v11.d[0]
#endif
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
- fmul v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
+ fmul v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.2d, v0.2d, v11.2d[1]
+ fmls v29.2d, v0.2d, v11.d[1]
#else
- fmul v29.2d, v0.2d, v11.2d[1]
+ fmul v29.2d, v0.2d, v11.d[1]
#endif
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- fmul v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
+ fmul v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.2d, v2.2d, v11.2d[1]
+ fmls v31.2d, v2.2d, v11.d[1]
#else
- fmul v31.2d, v2.2d, v11.2d[1]
+ fmul v31.2d, v2.2d, v11.d[1]
#endif
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
@@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_M1
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_E
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_SUB
@@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro SAVE4x4
@@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
.endm
.macro SAVE2x4
@@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
- OP_rr d24, d0, v10.2d[0]
- OP_ii d24, d1, v11.2d[0]
- OP_ri d25, d0, v11.2d[0]
- OP_ir d25, d1, v10.2d[0]
+ OP_rr d24, d0, v10.d[0]
+ OP_ii d24, d1, v11.d[0]
+ OP_ri d25, d0, v11.d[0]
+ OP_ir d25, d1, v10.d[0]
- OP_rr d28, d0, v10.2d[1]
- OP_ii d28, d1, v11.2d[1]
- OP_ri d29, d0, v11.2d[1]
- OP_ir d29, d1, v10.2d[1]
+ OP_rr d28, d0, v10.d[1]
+ OP_ii d28, d1, v11.d[1]
+ OP_ri d29, d0, v11.d[1]
+ OP_ir d29, d1, v10.d[1]
.endm
.macro SAVE1x4
@@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE4x2
@@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE2x2
@@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
.endm
.macro SAVE1x2
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 760d568cd..b37a4213b 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -3,14 +3,18 @@
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
-STRMMKERNEL = gemm_kernel_power6.S
+STRMMKERNEL = strmm_kernel_16x8_power8.S
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
-CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
+CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
-SGEMMKERNEL = gemm_kernel_power6.S
-SGEMMONCOPY = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMKERNEL = sgemm_kernel_16x8_power8.S
+SGEMMINCOPY = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY = ../generic/gemm_tcopy_16.c
+SGEMMONCOPY = ../generic/gemm_ncopy_8.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
@@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
-CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMKERNEL = cgemm_kernel_8x4_power8.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#ISMINKERNEL = ../arm/imin.c
#IDMINKERNEL = ../arm/imin.c
#
-#SASUMKERNEL = ../arm/asum.c
-#DASUMKERNEL = ../arm/asum.c
-#CASUMKERNEL = ../arm/zasum.c
-#ZASUMKERNEL = ../arm/zasum.c
+SASUMKERNEL = sasum.c
+DASUMKERNEL = dasum.c
+CASUMKERNEL = casum.c
+ZASUMKERNEL = zasum.c
#
#SAXPYKERNEL = ../arm/axpy.c
-#DAXPYKERNEL = ../arm/axpy.c
+DAXPYKERNEL = daxpy.c
#CAXPYKERNEL = ../arm/zaxpy.c
-#ZAXPYKERNEL = ../arm/zaxpy.c
+ZAXPYKERNEL = zaxpy.c
#
-#SCOPYKERNEL = ../arm/copy.c
-#DCOPYKERNEL = ../arm/copy.c
-#CCOPYKERNEL = ../arm/zcopy.c
-#ZCOPYKERNEL = ../arm/zcopy.c
+SCOPYKERNEL = scopy.c
+DCOPYKERNEL = dcopy.c
+CCOPYKERNEL = ccopy.c
+ZCOPYKERNEL = zcopy.c
#
-#SDOTKERNEL = ../arm/dot.c
-#DDOTKERNEL = ../arm/dot.c
+SDOTKERNEL = sdot.c
+DDOTKERNEL = ddot.c
#CDOTKERNEL = ../arm/zdot.c
-#ZDOTKERNEL = ../arm/zdot.c
+ZDOTKERNEL = zdot.c
#
#SNRM2KERNEL = ../arm/nrm2.c
#DNRM2KERNEL = ../arm/nrm2.c
#CNRM2KERNEL = ../arm/znrm2.c
#ZNRM2KERNEL = ../arm/znrm2.c
#
-#SROTKERNEL = ../arm/rot.c
-#DROTKERNEL = ../arm/rot.c
+SROTKERNEL = srot.c
+DROTKERNEL = drot.c
#CROTKERNEL = ../arm/zrot.c
#ZROTKERNEL = ../arm/zrot.c
#
-#SSCALKERNEL = ../arm/scal.c
-#DSCALKERNEL = ../arm/scal.c
+SSCALKERNEL = sscal.c
+DSCALKERNEL = dscal.c
#CSCALKERNEL = ../arm/zscal.c
-#ZSCALKERNEL = ../arm/zscal.c
+ZSCALKERNEL = zscal.c
#
-#SSWAPKERNEL = ../arm/swap.c
-#DSWAPKERNEL = ../arm/swap.c
-#CSWAPKERNEL = ../arm/zswap.c
-#ZSWAPKERNEL = ../arm/zswap.c
+SSWAPKERNEL = sswap.c
+DSWAPKERNEL = dswap.c
+CSWAPKERNEL = cswap.c
+ZSWAPKERNEL = zswap.c
#
#SGEMVNKERNEL = ../arm/gemv_n.c
-#DGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = dgemv_n.c
#CGEMVNKERNEL = ../arm/zgemv_n.c
#ZGEMVNKERNEL = ../arm/zgemv_n.c
#
#SGEMVTKERNEL = ../arm/gemv_t.c
#DGEMVTKERNEL = ../arm/gemv_t.c
#CGEMVTKERNEL = ../arm/zgemv_t.c
-#ZGEMVTKERNEL = ../arm/zgemv_t.c
+#ZGEMVTKERNEL = zgemv_t_4.c
#SSYMV_U_KERNEL = ../generic/symv_k.c
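The hunk above switches the single-precision and complex kernels for POWER8 from the generic ../arm C fallbacks to the new optimized sources. As an illustration only (not part of the patch), one quick way to confirm at runtime that a build picked these kernels up is to query the detected core; this sketch assumes a build whose cblas.h declares openblas_get_config() and openblas_get_corename(), as the header in this tree does.

#include <stdio.h>
#include <cblas.h>

int main(void)
{
	/* Both helpers are exported by libopenblas; the core name is expected
	   to read "POWER8" when the kernel selections above are in effect. */
	printf("config: %s\n", openblas_get_config());
	printf("core  : %s\n", openblas_get_corename());
	return 0;
}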
diff --git a/kernel/power/casum.c b/kernel/power/casum.c
new file mode 100644
index 000000000..aeed0ca78
--- /dev/null
+++ b/kernel/power/casum.c
@@ -0,0 +1,151 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#if defined(POWER8)
+#include "casum_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+{
+
+ BLASLONG i=0;
+ FLOAT *x = x1;
+ FLOAT temp0, temp1, temp2, temp3;
+ FLOAT temp4, temp5, temp6, temp7;
+ FLOAT sum0 = 0.0;
+ FLOAT sum1 = 0.0;
+ FLOAT sum2 = 0.0;
+ FLOAT sum3 = 0.0;
+
+ while ( i< n )
+ {
+
+ temp0 = ABS(x[0]);
+ temp1 = ABS(x[1]);
+ temp2 = ABS(x[2]);
+ temp3 = ABS(x[3]);
+ temp4 = ABS(x[4]);
+ temp5 = ABS(x[5]);
+ temp6 = ABS(x[6]);
+ temp7 = ABS(x[7]);
+
+ sum0 += temp0;
+ sum1 += temp1;
+ sum2 += temp2;
+ sum3 += temp3;
+
+ sum0 += temp4;
+ sum1 += temp5;
+ sum2 += temp6;
+ sum3 += temp7;
+
+ x+=8;
+ i+=4;
+
+ }
+
+ svec[0] = sum0+sum1+sum2+sum3;
+ svec[1] = 0.0;
+ svec[2] = 0.0;
+ svec[3] = 0.0;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ip=0;
+ FLOAT sumf = 0.0;
+ FLOAT svec[4] __attribute__ ((aligned (16)));
+ BLASLONG n1;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ if ( inc_x == 1 )
+ {
+
+ n1 = n & -16;
+ if ( n1 > 0 )
+ {
+
+ casum_kernel_16(n1, x, svec);
+ sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ i=n1;
+ ip = 2 * n1;
+ }
+
+ while(i < n)
+ {
+ sumf += ABS(x[ip]) + ABS(x[ip+1]);
+ ip += 2;
+ i++;
+ }
+
+ }
+ else
+ {
+ inc_x2 = 2 * inc_x;
+
+ while(i < n)
+ {
+ sumf += ABS(x[ip]) + ABS(x[ip+1]);
+ ip += inc_x2;
+ i++;
+ }
+
+ }
+ return(sumf);
+}
+
+
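For reference, the routine added above is the complex single-precision asum: it returns the sum of |Re(x_i)| + |Im(x_i)| over n complex elements, with the POWER8 micro-kernel covering multiples of 16 elements and the scalar tail loop the rest. A minimal sketch of the expected behaviour through the standard CBLAS wrapper (cblas_scasum), for illustration only and not part of the patch:

#include <stdio.h>
#include <math.h>
#include <cblas.h>

int main(void)
{
	float x[8] = { 1.0f, -2.0f, 3.0f, -4.0f, 0.5f, -0.5f, 6.0f, -7.0f }; /* 4 complex values */
	float ref = 0.0f;
	for (int i = 0; i < 8; i++)
		ref += fabsf(x[i]);               /* |Re| + |Im| of every element */
	printf("cblas_scasum = %g, reference = %g\n", cblas_scasum(4, x, 1), ref);
	return 0;
}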
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
new file mode 100644
index 000000000..cb50234ce
--- /dev/null
+++ b/kernel/power/casum_microk_power8.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
+static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
+
+static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ BLASLONG pre = 384;
+
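+ /*
+ * vs32-vs39 hold eight running partial sums of |Re| and |Im|.
+ * Each pass of the loop below consumes 16 complex elements
+ * (32 floats, 128 bytes): the next block is loaded while xvabssp and
+ * xvaddsp process the previous one. After the loop the partial sums
+ * are folded into vs32 and stored to svec; the caller adds the four
+ * lanes of svec to finish the reduction.
+ */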
+ __asm__ __volatile__
+ (
+
+ "dcbt %2 , %4 \n\t"
+
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2 , %4 \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "addic. %0 , %0 , -16 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+
+ "stxvw4x 32, 0, %3 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (svec), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c
new file mode 100644
index 000000000..ce7d67475
--- /dev/null
+++ b/kernel/power/ccopy.c
@@ -0,0 +1,140 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "ccopy_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+ while ( i < n )
+ {
+
+ f0 = x1[0];
+ f1 = x1[1];
+ f2 = x1[2];
+ f3 = x1[3];
+ f4 = x1[4];
+ f5 = x1[5];
+ f6 = x1[6];
+ f7 = x1[7];
+
+ y1[0] = f0;
+ y1[1] = f1;
+ y1[2] = f2;
+ y1[3] = f3;
+ y1[4] = f4;
+ y1[5] = f5;
+ y1[6] = f6;
+ y1[7] = f7;
+
+ x1 += 8;
+ y1 += 8;
+ i += 4;
+
+ }
+ return;
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0, iy=0;
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ BLASLONG n1 = n & -32;
+ if ( n1 > 0 )
+ {
+ ccopy_kernel_32(n1, x, y);
+ i=n1;
+ ix=n1*2;
+ iy=n1*2;
+ }
+
+ while(i < n)
+ {
+ y[iy] = x[iy] ;
+ y[iy+1] = x[ix+1] ;
+ ix+=2;
+ iy+=2;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ BLASLONG inc_x2 = 2 * inc_x;
+ BLASLONG inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+ y[iy] = x[ix] ;
+ y[iy+1] = x[ix+1] ;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
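The copy routine above is the complex single-precision BLAS1 copy: for unit strides it hands blocks of 32 complex elements to the POWER8 micro-kernel and finishes the remainder element by element, otherwise it falls back to the strided loop. A small usage sketch through the standard CBLAS wrapper (cblas_ccopy), for illustration only:

#include <stdio.h>
#include <cblas.h>

int main(void)
{
	float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };  /* 4 complex values, interleaved re/im */
	float y[8] = { 0 };
	cblas_ccopy(4, x, 1, y, 1);               /* y becomes a copy of x */
	printf("y[3] = (%g, %g)\n", y[6], y[7]);  /* expect (7, 8) */
	return 0;
}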
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
new file mode 100644
index 000000000..95b3559ba
--- /dev/null
+++ b/kernel/power/ccopy_microk_power8.c
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
+
+static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
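+ /*
+ * Software-pipelined copy: sixteen VSX registers are preloaded with
+ * 256 bytes (32 complex elements) of x, then each loop pass stores
+ * the previous block to y while loading the next one, and the code
+ * after label "2:" drains the registers that are still in flight.
+ */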
+ __asm__ __volatile__
+ (
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvw4x 50, 0, %2 \n\t"
+ "lxvw4x 51, %5, %2 \n\t"
+ "lxvw4x 52, %6, %2 \n\t"
+ "lxvw4x 53, %7, %2 \n\t"
+ "lxvw4x 54, %8, %2 \n\t"
+ "lxvw4x 55, %9, %2 \n\t"
+ "lxvw4x 56, %10, %2 \n\t"
+ "lxvw4x 57, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "stxvw4x 40, 0, %1 \n\t"
+ "stxvw4x 41, %5, %1 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "stxvw4x 42, %6, %1 \n\t"
+ "stxvw4x 43, %7, %1 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "stxvw4x 44, %8, %1 \n\t"
+ "stxvw4x 45, %9, %1 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "stxvw4x 46, %10, %1 \n\t"
+ "stxvw4x 47, %11, %1 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvw4x 50, 0, %1 \n\t"
+ "stxvw4x 51, %5, %1 \n\t"
+ "lxvw4x 50, 0, %2 \n\t"
+ "lxvw4x 51, %5, %2 \n\t"
+ "stxvw4x 52, %6, %1 \n\t"
+ "stxvw4x 53, %7, %1 \n\t"
+ "lxvw4x 52, %6, %2 \n\t"
+ "lxvw4x 53, %7, %2 \n\t"
+ "stxvw4x 54, %8, %1 \n\t"
+ "stxvw4x 55, %9, %1 \n\t"
+ "lxvw4x 54, %8, %2 \n\t"
+ "lxvw4x 55, %9, %2 \n\t"
+ "stxvw4x 56, %10, %1 \n\t"
+ "stxvw4x 57, %11, %1 \n\t"
+ "lxvw4x 56, %10, %2 \n\t"
+ "lxvw4x 57, %11, %2 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "stxvw4x 40, 0, %1 \n\t"
+ "stxvw4x 41, %5, %1 \n\t"
+ "stxvw4x 42, %6, %1 \n\t"
+ "stxvw4x 43, %7, %1 \n\t"
+ "stxvw4x 44, %8, %1 \n\t"
+ "stxvw4x 45, %9, %1 \n\t"
+ "stxvw4x 46, %10, %1 \n\t"
+ "stxvw4x 47, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvw4x 50, 0, %1 \n\t"
+ "stxvw4x 51, %5, %1 \n\t"
+ "stxvw4x 52, %6, %1 \n\t"
+ "stxvw4x 53, %7, %1 \n\t"
+ "stxvw4x 54, %8, %1 \n\t"
+ "stxvw4x 55, %9, %1 \n\t"
+ "stxvw4x 56, %10, %1 \n\t"
+ "stxvw4x 57, %11, %1 \n\t"
+
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S
new file mode 100644
index 000000000..0c462ce8e
--- /dev/null
+++ b/kernel/power/cgemm_kernel_8x4_power8.S
@@ -0,0 +1,407 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 32000
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO 240(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r10
+#define B r6
+#define C r7
+#define LDC r8
+#define OFFSET r9
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0 0
+
+#define alpha_dr vs28
+#define alpha_di vs29
+#define alpha_sr vs30
+#define alpha_si vs31
+
+#define FRAMEPOINTER r12
+
+#define BBUFFER r14
+#define L r15
+#define o12 r16
+#define o4 r17
+#define T2 r19
+#define BBO r20
+#define o8 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ mr FRAMEPOINTER, SP
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+#endif
+
+ stfs f1, ALPHA_R_SP
+ stfs f2, ALPHA_I_SP
+ // stw r0, FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#else
+#ifdef DOUBLE
+ lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
+ lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
+ lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
+#else
+ lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
+#else
+ lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET
+#endif
+#endif
+
+#include "cgemm_macros_8x4_power8.S"
+
+ cmpwi cr0, M, 0
+ ble L999_H1
+ cmpwi cr0, N, 0
+ ble L999_H1
+ cmpwi cr0, K, 0
+ ble L999_H1
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 384
+ li o4 , 4
+ li o8 , 8
+ li o12 , 12
+ li o16 , 16
+ li o32 , 32
+ li o48 , 48
+
+ addi BBUFFER, SP, 512+4096
+ li T1, -4096
+ and BBUFFER, BBUFFER, T1
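+	// BBUFFER now points at a 4 KB aligned scratch area inside the
+	// enlarged stack frame; the COPYB loops in the logic file splat the
+	// current panel of B into it before the compute loops run.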
+
+
+#ifdef __64BIT__
+ addi T1 , SP, 296
+#else
+ addi T1 , SP, 224
+#endif
+
+ stxsspx vs1, 0, T1
+ lxsspx alpha_dr, 0, T1
+ stxsspx vs2, o8 , T1
+ lxsspx alpha_di, o8, T1
+ addi T1, SP, 360
+ li T2, 0
+
+ stw T2, 0(T1)
+ stw T2, 4(T1)
+ stw T2, 8(T1)
+ stxsspx alpha_dr, o12, T1
+ lxvw4x alpha_sr, o0 , T1
+ addi T1, T1, 16
+
+ stw T2, 0(T1)
+ stw T2, 4(T1)
+ stw T2, 8(T1)
+ stxsspx alpha_di, o12, T1
+ lxvw4x alpha_si, o0 , T1
+
+ .align 5
+
+#include "cgemm_logic_8x4_power8.S"
+
+L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S
new file mode 100644
index 000000000..db2a57f91
--- /dev/null
+++ b/kernel/power/cgemm_logic_8x4_power8.S
@@ -0,0 +1,1459 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+ srawi. J, N, 2
+ ble CGEMM_L4_END
+
+CGEMM_L4_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 3
+
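+/*
+* Broadcast copy of B: each k step of a four-column panel holds 4 complex
+* values (8 words, 32 bytes). xxspltw splats every real and imaginary
+* component across a full vector register so the kernels below can use
+* plain vector-vector multiplies against pre-splatted B; T1 = 8*K counts
+* the words left to copy.
+*/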
+CGEMM_L4_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge CGEMM_L4_COPYB
+
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+ srawi. I, M, 3
+ ble CGEMM_L4x8_END
+
+CGEMM_L4x8_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L4x8_SUB4
+
+CGEMM_L4x8_LOOP_START:
+
+ dcbt AO, PRE
+ dcbt BO, PRE
+ LOAD4x8_1
+ dcbt BO, PRE
+ KERNEL4x8_I1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble CGEMM_L4x8_LOOP_END
+
+ .align 5
+
+CGEMM_L4x8_LOOP:
+
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt CGEMM_L4x8_LOOP
+
+CGEMM_L4x8_LOOP_END:
+
+ dcbt BO, PRE
+ KERNEL4x8_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b CGEMM_L4x8_SUB1
+
+CGEMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b CGEMM_L4x8_SUB1
+
+CGEMM_L4x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L4x8_SAVE
+ b CGEMM_L4x8_SUB2
+
+CGEMM_L4x8_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L4x8_SAVE
+
+CGEMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L4x8_SUB2
+
+CGEMM_L4x8_SAVE:
+
+ SAVE4x8
+
+ addic. I, I, -1
+ bgt CGEMM_L4x8_BEGIN
+
+CGEMM_L4x8_END:
+
+CGEMM_L4x4_BEGIN:
+
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L4x4_SUB4
+
+CGEMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble CGEMM_L4x4_LOOP_END
+
+ .align 5
+
+CGEMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt CGEMM_L4x4_LOOP
+
+CGEMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b CGEMM_L4x4_SUB1
+
+CGEMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b CGEMM_L4x4_SUB1
+
+CGEMM_L4x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L4x4_SAVE
+ b CGEMM_L4x4_SUB2
+
+CGEMM_L4x4_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L4x4_SAVE
+
+CGEMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L4x4_SUB2
+
+CGEMM_L4x4_SAVE:
+
+ SAVE4x4
+
+CGEMM_L4x4_END:
+
+CGEMM_L4x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble CGEMM_L4x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L4x2_SUB4
+
+CGEMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble CGEMM_L4x2_LOOP_END
+
+ .align 5
+
+CGEMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt CGEMM_L4x2_LOOP
+
+CGEMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b CGEMM_L4x2_SUB1
+
+CGEMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b CGEMM_L4x2_SUB1
+
+CGEMM_L4x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L4x2_SAVE
+ b CGEMM_L4x2_SUB2
+
+CGEMM_L4x2_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L4x2_SAVE
+
+CGEMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L4x2_SUB2
+
+CGEMM_L4x2_SAVE:
+
+ SAVE4x2
+
+CGEMM_L4x2_END:
+
+CGEMM_L4x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble CGEMM_L4x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L4x1_SUB4
+
+CGEMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble CGEMM_L4x1_LOOP_END
+
+ .align 5
+
+CGEMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt CGEMM_L4x1_LOOP
+
+CGEMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b CGEMM_L4x1_SUB1
+
+CGEMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b CGEMM_L4x1_SUB1
+
+CGEMM_L4x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L4x1_SAVE
+ b CGEMM_L4x1_SUB2
+
+CGEMM_L4x1_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L4x1_SAVE
+
+CGEMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L4x1_SUB2
+
+CGEMM_L4x1_SAVE:
+
+ SAVE4x1
+
+CGEMM_L4x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt CGEMM_L4_BEGIN
+
+ andi. T2, N, 3
+ ble L999_H2
+
+CGEMM_L4_END:
+
+ b CGEMM_L2_BEGIN
+
+L999_H1:
+
+ b L999_H2
+
+CGEMM_L2_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 2
+
+CGEMM_L2_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge CGEMM_L2_COPYB
+
+
+ andi. T1, N, 2
+ ble CGEMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+ srawi. I, M, 3
+ ble CGEMM_L2x8_END
+
+CGEMM_L2x8_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L2x8_SUB4
+
+CGEMM_L2x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x8_1
+ KERNEL2x8_I1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble CGEMM_L2x8_LOOP_END
+
+ .align 5
+
+CGEMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt CGEMM_L2x8_LOOP
+
+CGEMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b CGEMM_L2x8_SUB1
+
+CGEMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b CGEMM_L2x8_SUB1
+
+CGEMM_L2x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L2x8_SAVE
+ b CGEMM_L2x8_SUB2
+
+CGEMM_L2x8_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L2x8_SAVE
+
+CGEMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L2x8_SUB2
+
+CGEMM_L2x8_SAVE:
+
+ SAVE2x8
+
+ addic. I, I, -1
+ bgt CGEMM_L2x8_BEGIN
+
+CGEMM_L2x8_END:
+
+CGEMM_L2x4_BEGIN:
+
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L2x4_SUB4
+
+CGEMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble CGEMM_L2x4_LOOP_END
+
+ .align 5
+
+CGEMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt CGEMM_L2x4_LOOP
+
+CGEMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b CGEMM_L2x4_SUB1
+
+CGEMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b CGEMM_L2x4_SUB1
+
+CGEMM_L2x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L2x4_SAVE
+ b CGEMM_L2x4_SUB2
+
+CGEMM_L2x4_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L2x4_SAVE
+
+CGEMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L2x4_SUB2
+
+CGEMM_L2x4_SAVE:
+
+ SAVE2x4
+
+CGEMM_L2x4_END:
+
+CGEMM_L2x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble CGEMM_L2x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L2x2_SUB4
+
+CGEMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble CGEMM_L2x2_LOOP_END
+
+ .align 5
+
+CGEMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt CGEMM_L2x2_LOOP
+
+CGEMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b CGEMM_L2x2_SUB1
+
+CGEMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b CGEMM_L2x2_SUB1
+
+CGEMM_L2x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L2x2_SAVE
+ b CGEMM_L2x2_SUB2
+
+CGEMM_L2x2_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L2x2_SAVE
+
+CGEMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L2x2_SUB2
+
+CGEMM_L2x2_SAVE:
+
+ SAVE2x2
+
+CGEMM_L2x2_END:
+
+CGEMM_L2x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble CGEMM_L2x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L2x1_SUB4
+
+CGEMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble CGEMM_L2x1_LOOP_END
+
+ .align 5
+
+CGEMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt CGEMM_L2x1_LOOP
+
+CGEMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b CGEMM_L2x1_SUB1
+
+CGEMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b CGEMM_L2x1_SUB1
+
+CGEMM_L2x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L2x1_SAVE
+ b CGEMM_L2x1_SUB2
+
+CGEMM_L2x1_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L2x1_SAVE
+
+CGEMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L2x1_SUB2
+
+CGEMM_L2x1_SAVE:
+
+ SAVE2x1
+
+CGEMM_L2x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+CGEMM_L2_END:
+
+ b CGEMM_L1_BEGIN
+
+L999_H2:
+
+ b L999
+
+CGEMM_L1_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 1
+
+CGEMM_L1_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge CGEMM_L1_COPYB
+
+
+ andi. T1, N, 1
+ ble CGEMM_L1_END
+ mr CO, C
+ mr AO, A
+ srawi. I, M, 3
+ ble CGEMM_L1x8_END
+
+CGEMM_L1x8_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L1x8_SUB4
+
+CGEMM_L1x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x8_1
+ KERNEL1x8_I1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble CGEMM_L1x8_LOOP_END
+
+ .align 5
+
+CGEMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt CGEMM_L1x8_LOOP
+
+CGEMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b CGEMM_L1x8_SUB1
+
+CGEMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b CGEMM_L1x8_SUB1
+
+CGEMM_L1x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L1x8_SAVE
+ b CGEMM_L1x8_SUB2
+
+CGEMM_L1x8_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L1x8_SAVE
+
+CGEMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L1x8_SUB2
+
+CGEMM_L1x8_SAVE:
+
+ SAVE1x8
+
+ addic. I, I, -1
+ bgt CGEMM_L1x8_BEGIN
+
+CGEMM_L1x8_END:
+
+CGEMM_L1x4_BEGIN:
+
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L1x4_SUB4
+
+CGEMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble CGEMM_L1x4_LOOP_END
+
+ .align 5
+
+CGEMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt CGEMM_L1x4_LOOP
+
+CGEMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b CGEMM_L1x4_SUB1
+
+CGEMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b CGEMM_L1x4_SUB1
+
+CGEMM_L1x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L1x4_SAVE
+ b CGEMM_L1x4_SUB2
+
+CGEMM_L1x4_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L1x4_SAVE
+
+CGEMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L1x4_SUB2
+
+CGEMM_L1x4_SAVE:
+
+ SAVE1x4
+
+CGEMM_L1x4_END:
+
+CGEMM_L1x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble CGEMM_L1x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L1x2_SUB4
+
+CGEMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble CGEMM_L1x2_LOOP_END
+
+ .align 5
+
+CGEMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt CGEMM_L1x2_LOOP
+
+CGEMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b CGEMM_L1x2_SUB1
+
+CGEMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b CGEMM_L1x2_SUB1
+
+CGEMM_L1x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L1x2_SAVE
+ b CGEMM_L1x2_SUB2
+
+CGEMM_L1x2_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L1x2_SAVE
+
+CGEMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L1x2_SUB2
+
+CGEMM_L1x2_SAVE:
+
+ SAVE1x2
+
+CGEMM_L1x2_END:
+
+CGEMM_L1x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble CGEMM_L1x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble CGEMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble CGEMM_L1x1_SUB4
+
+CGEMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble CGEMM_L1x1_LOOP_END
+
+ .align 5
+
+CGEMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt CGEMM_L1x1_LOOP
+
+CGEMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b CGEMM_L1x1_SUB1
+
+CGEMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b CGEMM_L1x1_SUB1
+
+CGEMM_L1x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble CGEMM_L1x1_SAVE
+ b CGEMM_L1x1_SUB2
+
+CGEMM_L1x1_SUB1:
+
+ andi. L, K, 7
+ ble CGEMM_L1x1_SAVE
+
+CGEMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt CGEMM_L1x1_SUB2
+
+CGEMM_L1x1_SAVE:
+
+ SAVE1x1
+
+CGEMM_L1x1_END:
+
+CGEMM_L1_END:
diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S
new file mode 100644
index 000000000..9a18cb189
--- /dev/null
+++ b/kernel/power/cgemm_macros_8x4_power8.S
@@ -0,0 +1,6355 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xsadddp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvsubsp
+ #define XVFADD_I1 xvaddsp
+ #define XVFADD_I2 xvaddsp
+
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xsadddp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvaddsp
+ #define XVFADD_I1 xvsubsp
+ #define XVFADD_I2 xvaddsp
+
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xssubdp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvaddsp
+ #define XVFADD_I1 xvaddsp
+ #define XVFADD_I2 xvsubsp
+
+#else // CC || CR || RC || RR
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xssubdp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvsubsp
+ #define XVFADD_I1 xvsubsp
+ #define XVFADD_I2 xvsubsp
+
+#endif
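+
+/*
+* With a = a_r + i*a_i and b = b_r + i*b_i,
+*     a*b = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r)
+* and conjugation just flips the sign of a_i or b_i. The XSFADD_R1/R2 and
+* XSFADD_I1/I2 (scalar) and XVFADD_R1/R2, XVFADD_I1/I2 (vector) aliases
+* above pick add or subtract for those two accumulations, so one kernel
+* body serves all four conjugation combinations.
+*/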
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+ lxvw4x vs6, o32, AO // load a4, a5
+ lxvw4x vs7, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs20, o0, BO // load b2_r
+ lxvw4x vs21, o16, BO // load b2_i
+ lxvw4x vs22, o32, BO // load b3_r
+ lxvw4x vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+ lxvw4x vs6, o32, AO // load a4, a5
+ lxvw4x vs7, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs20, o0, BO // load b2_r
+ lxvw4x vs21, o16, BO // load b2_i
+ lxvw4x vs22, o32, BO // load b3_r
+ lxvw4x vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
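+
+// SAVE4x8 below stores one M=8, N=4 tile of C.  For each of the four
+// columns (N=0..3) it produces the eight complex results two at a time:
+// splat the accumulator words, recombine the partial products with the
+// conjugation-aware XVFADD_* macros, scale by the complex alpha held in
+// alpha_sr/alpha_si, and (unless TRMMKERNEL is defined) add the C values
+// loaded through T2; T1 advances by LDC between columns.  A minimal C
+// sketch of the per-element update (the names here are illustrative only,
+// not part of the kernel):
+//
+//     static inline void cscale_add(float c[2], float r_r, float r_i,
+//                                   float alpha_r, float alpha_i)
+//     {
+//         c[0] += r_r * alpha_r - r_i * alpha_i;   /* real part      */
+//         c[1] += r_r * alpha_i + r_i * alpha_r;   /* imaginary part */
+//     }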
+
+.macro SAVE4x8
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs40, 0
+ xxspltw vs9, vs40, 1
+ xxspltw vs10, vs40, 2
+ xxspltw vs11, vs40, 3
+
+
+ xxspltw vs12, vs41, 0
+ xxspltw vs13, vs41, 1
+ xxspltw vs14, vs41, 2
+ xxspltw vs15, vs41, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs42, 0
+ xxspltw vs9, vs42, 1
+ xxspltw vs10, vs42, 2
+ xxspltw vs11, vs42, 3
+
+
+ xxspltw vs12, vs43, 0
+ xxspltw vs13, vs43, 1
+ xxspltw vs14, vs43, 2
+ xxspltw vs15, vs43, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs44, 0
+ xxspltw vs9, vs44, 1
+ xxspltw vs10, vs44, 2
+ xxspltw vs11, vs44, 3
+
+
+ xxspltw vs12, vs45, 0
+ xxspltw vs13, vs45, 1
+ xxspltw vs14, vs45, 2
+ xxspltw vs15, vs45, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs46, 0
+ xxspltw vs9, vs46, 1
+ xxspltw vs10, vs46, 2
+ xxspltw vs11, vs46, 3
+
+
+ xxspltw vs12, vs47, 0
+ xxspltw vs13, vs47, 1
+ xxspltw vs14, vs47, 2
+ xxspltw vs15, vs47, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs48, 0
+ xxspltw vs9, vs48, 1
+ xxspltw vs10, vs48, 2
+ xxspltw vs11, vs48, 3
+
+
+ xxspltw vs12, vs49, 0
+ xxspltw vs13, vs49, 1
+ xxspltw vs14, vs49, 2
+ xxspltw vs15, vs49, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs50, 0
+ xxspltw vs9, vs50, 1
+ xxspltw vs10, vs50, 2
+ xxspltw vs11, vs50, 3
+
+
+ xxspltw vs12, vs51, 0
+ xxspltw vs13, vs51, 1
+ xxspltw vs14, vs51, 2
+ xxspltw vs15, vs51, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs52, 0
+ xxspltw vs9, vs52, 1
+ xxspltw vs10, vs52, 2
+ xxspltw vs11, vs52, 3
+
+
+ xxspltw vs12, vs53, 0
+ xxspltw vs13, vs53, 1
+ xxspltw vs14, vs53, 2
+ xxspltw vs15, vs53, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs54, 0
+ xxspltw vs9, vs54, 1
+ xxspltw vs10, vs54, 2
+ xxspltw vs11, vs54, 3
+
+
+ xxspltw vs12, vs55, 0
+ xxspltw vs13, vs55, 1
+ xxspltw vs14, vs55, 2
+ xxspltw vs15, vs55, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs56, 0
+ xxspltw vs9, vs56, 1
+ xxspltw vs10, vs56, 2
+ xxspltw vs11, vs56, 3
+
+
+ xxspltw vs12, vs57, 0
+ xxspltw vs13, vs57, 1
+ xxspltw vs14, vs57, 2
+ xxspltw vs15, vs57, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs58, 0
+ xxspltw vs9, vs58, 1
+ xxspltw vs10, vs58, 2
+ xxspltw vs11, vs58, 3
+
+
+ xxspltw vs12, vs59, 0
+ xxspltw vs13, vs59, 1
+ xxspltw vs14, vs59, 2
+ xxspltw vs15, vs59, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs60, 0
+ xxspltw vs9, vs60, 1
+ xxspltw vs10, vs60, 2
+ xxspltw vs11, vs60, 3
+
+
+ xxspltw vs12, vs61, 0
+ xxspltw vs13, vs61, 1
+ xxspltw vs14, vs61, 2
+ xxspltw vs15, vs61, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs62, 0
+ xxspltw vs9, vs62, 1
+ xxspltw vs10, vs62, 2
+ xxspltw vs11, vs62, 3
+
+
+ xxspltw vs12, vs63, 0
+ xxspltw vs13, vs63, 1
+ xxspltw vs14, vs63, 2
+ xxspltw vs15, vs63, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
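+// The N=4, M=4 macros follow the same double-buffered scheme as the M=8
+// versions above, but carry only two A vectors (vs0/vs1, with vs4/vs5 as
+// the second buffer) and use accumulators vs32..vs47.
+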
+.macro LOAD4x4_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs20, o0, BO // load b2_r
+ lxvw4x vs21, o16, BO // load b2_i
+ lxvw4x vs22, o32, BO // load b3_r
+ lxvw4x vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs20, o0, BO // load b2_r
+ lxvw4x vs21, o16, BO // load b2_i
+ lxvw4x vs22, o32, BO // load b3_r
+ lxvw4x vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
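+
+// SAVE4x4 performs the same per-element update as SAVE4x8 (see the note
+// before that macro), just for an M=4 tile: two 16-byte stores per column.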
+
+.macro SAVE4x4
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs40, 0
+ xxspltw vs9, vs40, 1
+ xxspltw vs10, vs40, 2
+ xxspltw vs11, vs40, 3
+
+
+ xxspltw vs12, vs41, 0
+ xxspltw vs13, vs41, 1
+ xxspltw vs14, vs41, 2
+ xxspltw vs15, vs41, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs42, 0
+ xxspltw vs9, vs42, 1
+ xxspltw vs10, vs42, 2
+ xxspltw vs11, vs42, 3
+
+
+ xxspltw vs12, vs43, 0
+ xxspltw vs13, vs43, 1
+ xxspltw vs14, vs43, 2
+ xxspltw vs15, vs43, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs44, 0
+ xxspltw vs9, vs44, 1
+ xxspltw vs10, vs44, 2
+ xxspltw vs11, vs44, 3
+
+
+ xxspltw vs12, vs45, 0
+ xxspltw vs13, vs45, 1
+ xxspltw vs14, vs45, 2
+ xxspltw vs15, vs45, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs46, 0
+ xxspltw vs9, vs46, 1
+ xxspltw vs10, vs46, 2
+ xxspltw vs11, vs46, 3
+
+
+ xxspltw vs12, vs47, 0
+ xxspltw vs13, vs47, 1
+ xxspltw vs14, vs47, 2
+ xxspltw vs15, vs47, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
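+
+/* The macro families in this file follow one naming scheme; a rough
+ * sketch of the intended calling order:
+ *
+ *   LOAD4x2_1        - prime vs0 / vs8-vs15 for the first iteration
+ *   KERNEL4x2_I1     - first pipelined step: xvmulsp initialises the
+ *                      accumulators (no zeroing needed) while the next
+ *                      A/B block is loaded into vs4 / vs16-vs23
+ *   KERNEL4x2_1 / _2 - unrolled pairs that ping-pong between the two
+ *                      register sets
+ *   KERNEL4x2_E2     - epilogue: final FMAs, no further loads
+ *   KERNEL4x2_SUBI1  - single non-pipelined initialising step
+ *   KERNEL4x2_SUB1   - single non-pipelined FMA step (k remainder)
+ *
+ * For this N=4, M=2 family each step consumes 16 bytes of packed A (one
+ * complex pair) and 128 bytes of packed B (b0..b3, with real and
+ * imaginary parts apparently pre-broadcast by the pack routine).
+ */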
+
+.macro LOAD4x2_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs20, o0, BO // load b2_r
+ lxvw4x vs21, o16, BO // load b2_i
+ lxvw4x vs22, o32, BO // load b3_r
+ lxvw4x vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs20, o0, BO // load b2_r
+ lxvw4x vs21, o16, BO // load b2_i
+ lxvw4x vs22, o32, BO // load b3_r
+ lxvw4x vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxvw4x vs12, o0, BO // load b2_r
+ lxvw4x vs13, o16, BO // load b2_i
+ lxvw4x vs14, o32, BO // load b3_r
+ lxvw4x vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
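+
+/* With a single complex element of A per step, this family works on
+ * scalars: lxsspx/stxsspx move individual floats, and the four partial
+ * products of every column are kept in separate accumulators
+ * (vs32-vs47) via the scalar FMA xsmaddadp.
+ */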
+
+.macro LOAD4x1_1
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxsspx vs12, o0, BO // load b2_r
+ lxsspx vs13, o16, BO // load b2_i
+ lxsspx vs14, o32, BO // load b3_r
+ lxsspx vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs16, o0, BO // load b0_r
+ lxsspx vs17, o16, BO // load b0_i
+ lxsspx vs18, o32, BO // load b1_r
+ lxsspx vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxsspx vs20, o0, BO // load b2_r
+ lxsspx vs21, o16, BO // load b2_i
+ lxsspx vs22, o32, BO // load b3_r
+ lxsspx vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmuldp vs40, vs0, vs12 // a0_r*b2_r
+ xsmuldp vs41, vs1, vs13 // a0_i*b2_i
+ xsmuldp vs42, vs0, vs13 // a0_r*b2_i
+ xsmuldp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmuldp vs44, vs0, vs14 // a0_r*b3_r
+ xsmuldp vs45, vs1, vs15 // a0_i*b3_i
+ xsmuldp vs46, vs0, vs15 // a0_r*b3_i
+ xsmuldp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs16, o0, BO // load b0_r
+ lxsspx vs17, o16, BO // load b0_i
+ lxsspx vs18, o32, BO // load b1_r
+ lxsspx vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxsspx vs20, o0, BO // load b2_r
+ lxsspx vs21, o16, BO // load b2_i
+ lxsspx vs22, o32, BO // load b3_r
+ lxsspx vs23, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmaddadp vs40, vs0, vs12 // a0_r*b2_r
+ xsmaddadp vs41, vs1, vs13 // a0_i*b2_i
+ xsmaddadp vs42, vs0, vs13 // a0_r*b2_i
+ xsmaddadp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmaddadp vs44, vs0, vs14 // a0_r*b3_r
+ xsmaddadp vs45, vs1, vs15 // a0_i*b3_i
+ xsmaddadp vs46, vs0, vs15 // a0_r*b3_i
+ xsmaddadp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxsspx vs12, o0, BO // load b2_r
+ lxsspx vs13, o16, BO // load b2_i
+ lxsspx vs14, o32, BO // load b3_r
+ lxsspx vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+ xsmaddadp vs36, vs4, vs18 // a4_r*b1_r
+ xsmaddadp vs37, vs5, vs19 // a4_i*b1_i
+ xsmaddadp vs38, vs4, vs19 // a4_r*b1_i
+ xsmaddadp vs39, vs5, vs18 // a4_i*b1_r
+
+ xsmaddadp vs40, vs4, vs20 // a4_r*b2_r
+ xsmaddadp vs41, vs5, vs21 // a4_i*b2_i
+ xsmaddadp vs42, vs4, vs21 // a4_r*b2_i
+ xsmaddadp vs43, vs5, vs20 // a4_i*b2_r
+
+ xsmaddadp vs44, vs4, vs22 // a4_r*b3_r
+ xsmaddadp vs45, vs5, vs23 // a4_i*b3_i
+ xsmaddadp vs46, vs4, vs23 // a4_r*b3_i
+ xsmaddadp vs47, vs5, vs22 // a4_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+ xsmaddadp vs36, vs4, vs18 // a4_r*b1_r
+ xsmaddadp vs37, vs5, vs19 // a4_i*b1_i
+ xsmaddadp vs38, vs4, vs19 // a4_r*b1_i
+ xsmaddadp vs39, vs5, vs18 // a4_i*b1_r
+
+ xsmaddadp vs40, vs4, vs20 // a4_r*b2_r
+ xsmaddadp vs41, vs5, vs21 // a4_i*b2_i
+ xsmaddadp vs42, vs4, vs21 // a4_r*b2_i
+ xsmaddadp vs43, vs5, vs20 // a4_i*b2_r
+
+ xsmaddadp vs44, vs4, vs22 // a4_r*b3_r
+ xsmaddadp vs45, vs5, vs23 // a4_i*b3_i
+ xsmaddadp vs46, vs4, vs23 // a4_r*b3_i
+ xsmaddadp vs47, vs5, vs22 // a4_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxsspx vs12, o0, BO // load b2_r
+ lxsspx vs13, o16, BO // load b2_i
+ lxsspx vs14, o32, BO // load b3_r
+ lxsspx vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmuldp vs40, vs0, vs12 // a0_r*b2_r
+ xsmuldp vs41, vs1, vs13 // a0_i*b2_i
+ xsmuldp vs42, vs0, vs13 // a0_r*b2_i
+ xsmuldp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmuldp vs44, vs0, vs14 // a0_r*b3_r
+ xsmuldp vs45, vs1, vs15 // a0_i*b3_i
+ xsmuldp vs46, vs0, vs15 // a0_r*b3_i
+ xsmuldp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+ lxsspx vs12, o0, BO // load b2_r
+ lxsspx vs13, o16, BO // load b2_i
+ lxsspx vs14, o32, BO // load b3_r
+ lxsspx vs15, o48, BO // load b3_i
+
+ addi BO, BO, 64
+
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmaddadp vs40, vs0, vs12 // a0_r*b2_r
+ xsmaddadp vs41, vs1, vs13 // a0_i*b2_i
+ xsmaddadp vs42, vs0, vs13 // a0_r*b2_i
+ xsmaddadp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmaddadp vs44, vs0, vs14 // a0_r*b3_r
+ xsmaddadp vs45, vs1, vs15 // a0_i*b3_i
+ xsmaddadp vs46, vs0, vs15 // a0_r*b3_i
+ xsmaddadp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
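+
+/* SAVE4x1 (below) folds the four scalar partial products of each column
+ * with the XSFADD_R1/R2/I1/I2 helpers (assumed to carry the sign pattern
+ * of the selected conjugation variant) and then applies alpha through
+ * the scalar registers alpha_dr/alpha_di, using the same complex product
+ * as the vector SAVE macros:
+ *
+ *   c_r += r_r * alpha_r - r_i * alpha_i
+ *   c_i += r_r * alpha_i + r_i * alpha_r
+ */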
+
+.macro SAVE4x1
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
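+
+/* For N=2 the widest block works on eight complex elements of A
+ * (vs0-vs3) against one b0/b1 column pair, so AO and BO both advance
+ * 64 bytes per step.  vs32-vs39 accumulate the b0 products and
+ * vs40-vs47 the b1 products, two C elements per accumulator pair.
+ */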
+
+.macro LOAD2x8_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+ lxvw4x vs6, o32, AO // load a4, a5
+ lxvw4x vs7, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs42, vs1, vs10 // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+ xvmulsp vs43, vs1, vs11 // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+ xvmulsp vs44, vs2, vs10 // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+ xvmulsp vs45, vs2, vs11 // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+ xvmulsp vs46, vs3, vs10 // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+ xvmulsp vs47, vs3, vs11 // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+ lxvw4x vs6, o32, AO // load a4, a5
+ lxvw4x vs7, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs1, vs10 // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+ xvmaddasp vs43, vs1, vs11 // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+ xvmaddasp vs44, vs2, vs10 // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+ xvmaddasp vs45, vs2, vs11 // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+ xvmaddasp vs46, vs3, vs10 // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+ xvmaddasp vs47, vs3, vs11 // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs42, vs1, vs10 // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+ xvmulsp vs43, vs1, vs11 // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+ xvmulsp vs44, vs2, vs10 // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+ xvmulsp vs45, vs2, vs11 // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+ xvmulsp vs46, vs3, vs10 // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+ xvmulsp vs47, vs3, vs11 // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs42, vs1, vs10 // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+ xvmaddasp vs43, vs1, vs11 // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+ xvmaddasp vs44, vs2, vs10 // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+ xvmaddasp vs45, vs2, vs11 // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+ xvmaddasp vs46, vs3, vs10 // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+ xvmaddasp vs47, vs3, vs11 // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
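+
+/* SAVE2x8 (below) repeats the SAVE4x4 per-element reduction for four M
+ * blocks (M=0,2,4,6) in each of the two columns, stepping through C with
+ * T2 in 16-byte increments and moving to the next column via LDC.
+ */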
+
+.macro SAVE2x8
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs40, 0
+ xxspltw vs9, vs40, 1
+ xxspltw vs10, vs40, 2
+ xxspltw vs11, vs40, 3
+
+
+ xxspltw vs12, vs41, 0
+ xxspltw vs13, vs41, 1
+ xxspltw vs14, vs41, 2
+ xxspltw vs15, vs41, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs42, 0
+ xxspltw vs9, vs42, 1
+ xxspltw vs10, vs42, 2
+ xxspltw vs11, vs42, 3
+
+
+ xxspltw vs12, vs43, 0
+ xxspltw vs13, vs43, 1
+ xxspltw vs14, vs43, 2
+ xxspltw vs15, vs43, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs44, 0
+ xxspltw vs9, vs44, 1
+ xxspltw vs10, vs44, 2
+ xxspltw vs11, vs44, 3
+
+
+ xxspltw vs12, vs45, 0
+ xxspltw vs13, vs45, 1
+ xxspltw vs14, vs45, 2
+ xxspltw vs15, vs45, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs46, 0
+ xxspltw vs9, vs46, 1
+ xxspltw vs10, vs46, 2
+ xxspltw vs11, vs46, 3
+
+
+ xxspltw vs12, vs47, 0
+ xxspltw vs13, vs47, 1
+ xxspltw vs14, vs47, 2
+ xxspltw vs15, vs47, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
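+
+// Naming convention, shared by the macro families below:
+//   LOAD2x4_1       preloads A (vs0-vs1) and B (vs8-vs11) ahead of the unrolled loop
+//   KERNEL2x4_I1    loads the second register set and starts the accumulators with xvmulsp
+//   KERNEL2x4_1/_2  ping-pong between the two register sets using xvmaddasp
+//   KERNEL2x4_E2    drains the last preloaded set without issuing further loads
+//   KERNEL2x4_SUBI1/_SUB1  process the left-over iterations one step at a time
+//   SAVE2x4         applies alpha, adds the C tile (unless TRMMKERNEL) and stores it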
+
+.macro LOAD2x4_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
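+// vs24 stays zero; it is the shift-in operand for the xxsldwi word moves below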
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+ lxvw4x vs18, o32, BO // load b1_r
+ lxvw4x vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+ lxvw4x vs10, o32, BO // load b1_r
+ lxvw4x vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs16, o0, BO // load b0_r
+ lxsspx vs17, o16, BO // load b0_i
+ lxsspx vs18, o32, BO // load b1_r
+ lxsspx vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs16, o0, BO // load b0_r
+ lxsspx vs17, o16, BO // load b0_i
+ lxsspx vs18, o32, BO // load b1_r
+ lxsspx vs19, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+ xsmaddadp vs36, vs4, vs18 // a4_r*b1_r
+ xsmaddadp vs37, vs5, vs19 // a4_i*b1_i
+ xsmaddadp vs38, vs4, vs19 // a4_r*b1_i
+ xsmaddadp vs39, vs5, vs18 // a4_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+ xsmaddadp vs36, vs4, vs18 // a4_r*b1_r
+ xsmaddadp vs37, vs5, vs19 // a4_i*b1_i
+ xsmaddadp vs38, vs4, vs19 // a4_r*b1_i
+ xsmaddadp vs39, vs5, vs18 // a4_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+ lxsspx vs10, o32, BO // load b1_r
+ lxsspx vs11, o48, BO // load b1_i
+
+ addi BO, BO, 64
+
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
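+
+// The N=1 kernels consume a single B column per step (b0_r/b0_i, 32 bytes of B)
+// against 64 bytes of A (a0..a7) and accumulate into vs32-vs39, using the same
+// double-buffered _I1/_1/_2/_E2 scheme as the N=2 macros above.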
+
+.macro LOAD1x8_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+ lxvw4x vs6, o32, AO // load a4, a5
+ lxvw4x vs7, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+ lxvw4x vs6, o32, AO // load a4, a5
+ lxvw4x vs7, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+ lxvw4x vs2, o32, AO // load a4, a5
+ lxvw4x vs3, o48, AO // load a6, a7
+
+ addi AO, AO, 64
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ addi AO, AO, 32
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs16, o0, BO // load b0_r
+ lxvw4x vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ addi AO, AO, 16
+
+ lxvw4x vs8, o0, BO // load b0_r
+ lxvw4x vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
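+
+// The 1x1 tail is fully scalar: lxsspx/stxsspx move one complex element at a time,
+// and SAVE1x1 combines the four partial products through the XSFADD_* helpers.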
+
+.macro LOAD1x1_1
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs16, o0, BO // load b0_r
+ lxsspx vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs16, o0, BO // load b0_r
+ lxsspx vs17, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ lxsspx vs8, o0, BO // load b0_r
+ lxsspx vs9, o16, BO // load b0_i
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+ xxlxor vs24, vs24, vs24
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c
new file mode 100644
index 000000000..da97c896e
--- /dev/null
+++ b/kernel/power/cswap.c
@@ -0,0 +1,175 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+
+#if defined(POWER8)
+#include "cswap_microk_power8.c"
+#endif
+
+
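+/* cswap_microk_power8.c defines HAVE_KERNEL_32 and provides a VSX version
+   of cswap_kernel_32; the plain C loop below is only compiled as a fallback
+   when no optimized micro kernel is available for the target. */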
+#ifndef HAVE_KERNEL_32
+
+static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT g0, g1, g2, g3, g4, g5, g6, g7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+	while ( i < n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		g0 = y1[0];
+		g1 = y1[1];
+		g2 = y1[2];
+		g3 = y1[3];
+		g4 = y1[4];
+		g5 = y1[5];
+		g6 = y1[6];
+		g7 = y1[7];
+
+		x1[0] = g0;
+		x1[1] = g1;
+		x1[2] = g2;
+		x1[3] = g3;
+		x1[4] = g4;
+		x1[5] = g5;
+		x1[6] = g6;
+		x1[7] = g7;
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+		i  += 4;
+
+	}
+	return;
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0, iy=0;
+	FLOAT temp[2];
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		BLASLONG n1 = n & -32;
+
+		if ( n1 > 0 )
+ {
+ cswap_kernel_32(n1, x, y);
+ i=n1;
+ ix = 2* n1;
+ iy = 2* n1;
+ }
+
+ while(i < n)
+ {
+
+ temp[0] = x[ix] ;
+ temp[1] = x[ix+1] ;
+ x[ix] = y[iy] ;
+ x[ix+1] = y[iy+1] ;
+ y[iy] = temp[0] ;
+ y[iy+1] = temp[1] ;
+
+ ix += 2 ;
+ iy += 2 ;
+ i++ ;
+
+
+ }
+
+
+ }
+ else
+ {
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+
+ temp[0] = x[ix] ;
+ temp[1] = x[ix+1] ;
+ x[ix] = y[iy] ;
+ x[ix+1] = y[iy+1] ;
+ y[iy] = temp[0] ;
+ y[iy+1] = temp[1] ;
+
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
new file mode 100644
index 000000000..90ab59c54
--- /dev/null
+++ b/kernel/power/cswap_microk_power8.c
@@ -0,0 +1,180 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
+
+static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *x2=x+1;
+ FLOAT *y2=y+1;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
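+	/*
+	 * Each pass of the loop below swaps 32 complex values (256 bytes):
+	 * vs32-vs47 are loaded from x, vs48-vs63 from y, and both blocks are
+	 * stored back crosswise. %1/%2 (y1/x1) advance with the loads, while
+	 * %3/%4 (y2/x2, pulled back by 4 bytes so they point at y and x)
+	 * advance with the stores.
+	 */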
+ __asm__ __volatile__
+ (
+
+ "addi %3, %3, -4 \n\t"
+ "addi %4, %4, -4 \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %5, %2 \n\t"
+ "lxvw4x 34, %6, %2 \n\t"
+ "lxvw4x 35, %7, %2 \n\t"
+ "lxvw4x 36, %8, %2 \n\t"
+ "lxvw4x 37, %9, %2 \n\t"
+ "lxvw4x 38, %10, %2 \n\t"
+ "lxvw4x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvw4x 48, 0, %1 \n\t"
+ "lxvw4x 49, %5, %1 \n\t"
+ "lxvw4x 50, %6, %1 \n\t"
+ "lxvw4x 51, %7, %1 \n\t"
+ "lxvw4x 52, %8, %1 \n\t"
+ "lxvw4x 53, %9, %1 \n\t"
+ "lxvw4x 54, %10, %1 \n\t"
+ "lxvw4x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "lxvw4x 56, 0, %1 \n\t"
+ "lxvw4x 57, %5, %1 \n\t"
+ "lxvw4x 58, %6, %1 \n\t"
+ "lxvw4x 59, %7, %1 \n\t"
+ "lxvw4x 60, %8, %1 \n\t"
+ "lxvw4x 61, %9, %1 \n\t"
+ "lxvw4x 62, %10, %1 \n\t"
+ "lxvw4x 63, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 48, 0, %4 \n\t"
+ "stxvw4x 49, %5, %4 \n\t"
+ "stxvw4x 50, %6, %4 \n\t"
+ "stxvw4x 51, %7, %4 \n\t"
+ "stxvw4x 52, %8, %4 \n\t"
+ "stxvw4x 53, %9, %4 \n\t"
+ "stxvw4x 54, %10, %4 \n\t"
+ "stxvw4x 55, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvw4x 56, 0, %4 \n\t"
+ "stxvw4x 57, %5, %4 \n\t"
+ "stxvw4x 58, %6, %4 \n\t"
+ "stxvw4x 59, %7, %4 \n\t"
+ "stxvw4x 60, %8, %4 \n\t"
+ "stxvw4x 61, %9, %4 \n\t"
+ "stxvw4x 62, %10, %4 \n\t"
+ "stxvw4x 63, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (y2), // 3
+ "r" (x2), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S
new file mode 100644
index 000000000..460a387fb
--- /dev/null
+++ b/kernel/power/ctrmm_kernel_8x4_power8.S
@@ -0,0 +1,399 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 400
+#define ALPHA_R_SP 304(SP)
+#define ALPHA_I_SP 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO 240(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r10
+#define B r6
+#define C r7
+#define LDC r8
+#define OFFSET r9
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0 0
+
+#define alpha_dr vs28
+#define alpha_di vs29
+#define alpha_sr vs30
+#define alpha_si vs31
+
+#define o12 r12
+#define KKK r13
+#define K1 r14
+#define L r15
+#define o16 r16
+#define NOTUSED r17
+#define T2 r19
+#define KK r20
+#define o8 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o4 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+ std r13, 288(SP)
+ std r12, 296(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+ stw r13, 216(SP)
+#endif
+
+ stfs f1, ALPHA_R_SP
+ stfs f2, ALPHA_I_SP
+ // stw r0, FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz B, FRAMESLOT(0) + STACKSIZE(SP)
+ lwz C, FRAMESLOT(1) + STACKSIZE(SP)
+ lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET
+#endif
+#endif
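+// For the right-side (non-LEFT) TRMM variants the running offset KK starts
+// at -OFFSET; the tile logic in ctrmm_logic_8x4_power8.S derives the AO/BO
+// start positions and the effective K count from it.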
+
+#include "ctrmm_macros_8x4_power8.S"
+
+ cmpwi cr0, M, 0
+ ble L999_H1
+ cmpwi cr0, N, 0
+ ble L999_H1
+ cmpwi cr0, K, 0
+ ble L999_H1
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 384
+ li o4 , 4
+ li o8 , 8
+ li o12 , 12
+ li o16 , 16
+ li o32 , 32
+ li o48 , 48
+
+
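+// Reload alpha_r/alpha_i as scalars from the stack, then build the vector
+// forms alpha_sr/alpha_si: two 16-byte scratch slots at SP+360 are zeroed
+// word by word, the scalar is stored into the last word of each and the
+// quadwords are loaded back.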
+#ifdef __64BIT__
+ addi T1, SP, 304
+#else
+ addi T1, SP, 224
+#endif
+
+ lxsspx alpha_dr, 0, T1
+ lxsspx alpha_di, o8, T1
+ addi T1, SP, 360
+ li T2, 0
+
+ stw T2, 0(T1)
+ stw T2, 4(T1)
+ stw T2, 8(T1)
+ stxsspx alpha_dr, o12, T1
+ lxvw4x alpha_sr, o0 , T1
+ addi T1, T1, 16
+
+ stw T2, 0(T1)
+ stw T2, 4(T1)
+ stw T2, 8(T1)
+ stxsspx alpha_di, o12, T1
+ lxvw4x alpha_si, o0 , T1
+
+ .align 5
+
+#include "ctrmm_logic_8x4_power8.S"
+
+L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+ ld r13, 288(SP)
+ ld r12, 296(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+ lwz r13, 216(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S
new file mode 100644
index 000000000..9ab258501
--- /dev/null
+++ b/kernel/power/ctrmm_logic_8x4_power8.S
@@ -0,0 +1,1769 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
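+// Driver logic: N is tiled by 4/2/1 columns and, within each N tile, M is
+// tiled by 8/4/2/1 rows. The K loop is unrolled by eight (LOOP and SUB4
+// paths) with a SUB2 tail for the remaining iterations; KK/KKK carry the
+// TRMM offset bookkeeping that depends on LEFT and TRANSA.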
+ srawi. J, N, 2
+ ble CTRMM_L4_END
+
+CTRMM_L4_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble CTRMM_L4x8_END
+
+CTRMM_L4x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L4x8_SUB4
+
+CTRMM_L4x8_LOOP_START:
+
+ dcbt AO, PRE
+ dcbt BO, PRE
+ LOAD4x8_1
+ KERNEL4x8_I1
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble CTRMM_L4x8_LOOP_END
+
+ .align 5
+
+CTRMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt CTRMM_L4x8_LOOP
+
+CTRMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+ KERNEL4x8_1
+ dcbt AO, PRE
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b CTRMM_L4x8_SUB1
+
+CTRMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b CTRMM_L4x8_SUB1
+
+CTRMM_L4x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L4x8_SAVE
+ b CTRMM_L4x8_SUB2
+
+CTRMM_L4x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L4x8_SAVE
+
+CTRMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L4x8_SUB2
+
+CTRMM_L4x8_SAVE:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt CTRMM_L4x8_BEGIN
+
+CTRMM_L4x8_END:
+
+CTRMM_L4x4_BEGIN:
+ andi. T2, M, 7
+ ble CTRMM_L4x1_END
+
+ andi. T1, M, 4
+ ble CTRMM_L4x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L4x4_SUB4
+
+CTRMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble CTRMM_L4x4_LOOP_END
+
+ .align 5
+
+CTRMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt CTRMM_L4x4_LOOP
+
+CTRMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b CTRMM_L4x4_SUB1
+
+CTRMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b CTRMM_L4x4_SUB1
+
+CTRMM_L4x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L4x4_SAVE
+ b CTRMM_L4x4_SUB2
+
+CTRMM_L4x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L4x4_SAVE
+
+CTRMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L4x4_SUB2
+
+CTRMM_L4x4_SAVE:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+CTRMM_L4x4_END:
+
+CTRMM_L4x2_BEGIN:
+
+ andi. T1, M, 2
+ ble CTRMM_L4x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L4x2_SUB4
+
+CTRMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble CTRMM_L4x2_LOOP_END
+
+ .align 5
+
+CTRMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt CTRMM_L4x2_LOOP
+
+CTRMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b CTRMM_L4x2_SUB1
+
+CTRMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b CTRMM_L4x2_SUB1
+
+CTRMM_L4x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L4x2_SAVE
+ b CTRMM_L4x2_SUB2
+
+CTRMM_L4x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L4x2_SAVE
+
+CTRMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L4x2_SUB2
+
+CTRMM_L4x2_SAVE:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+CTRMM_L4x2_END:
+
+CTRMM_L4x1_BEGIN:
+
+ andi. T1, M, 1
+ ble CTRMM_L4x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L4x1_SUB4
+
+CTRMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble CTRMM_L4x1_LOOP_END
+
+ .align 5
+
+CTRMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt CTRMM_L4x1_LOOP
+
+CTRMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b CTRMM_L4x1_SUB1
+
+CTRMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b CTRMM_L4x1_SUB1
+
+CTRMM_L4x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L4x1_SAVE
+ b CTRMM_L4x1_SUB2
+
+CTRMM_L4x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L4x1_SAVE
+
+CTRMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L4x1_SUB2
+
+CTRMM_L4x1_SAVE:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+CTRMM_L4x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in B
+#endif
+
+
+ addic. J, J, -1
+ bgt CTRMM_L4_BEGIN
+
+ andi. T2, N, 3
+ ble L999_H2
+
+CTRMM_L4_END:
+
+ b CTRMM_L2_BEGIN
+
+L999_H1:
+
+ b L999_H2
+
+CTRMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble CTRMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble CTRMM_L2x8_END
+
+CTRMM_L2x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L2x8_SUB4
+
+CTRMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble CTRMM_L2x8_LOOP_END
+
+ .align 5
+
+CTRMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt CTRMM_L2x8_LOOP
+
+CTRMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b CTRMM_L2x8_SUB1
+
+CTRMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b CTRMM_L2x8_SUB1
+
+CTRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L2x8_SAVE
+ b CTRMM_L2x8_SUB2
+
+CTRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L2x8_SAVE
+
+CTRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L2x8_SUB2
+
+CTRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt CTRMM_L2x8_BEGIN
+
+CTRMM_L2x8_END:
+
+CTRMM_L2x4_BEGIN:
+ andi. T2, M, 7
+ ble CTRMM_L2x1_END
+
+ andi. T1, M, 4
+ ble CTRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L2x4_SUB4
+
+CTRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble CTRMM_L2x4_LOOP_END
+
+ .align 5
+
+CTRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt CTRMM_L2x4_LOOP
+
+CTRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b CTRMM_L2x4_SUB1
+
+CTRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b CTRMM_L2x4_SUB1
+
+CTRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L2x4_SAVE
+ b CTRMM_L2x4_SUB2
+
+CTRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L2x4_SAVE
+
+CTRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L2x4_SUB2
+
+CTRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+CTRMM_L2x4_END:
+
+CTRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble CTRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L2x2_SUB4
+
+CTRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble CTRMM_L2x2_LOOP_END
+
+ .align 5
+
+CTRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt CTRMM_L2x2_LOOP
+
+CTRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b CTRMM_L2x2_SUB1
+
+CTRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b CTRMM_L2x2_SUB1
+
+CTRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L2x2_SAVE
+ b CTRMM_L2x2_SUB2
+
+CTRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L2x2_SAVE
+
+CTRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L2x2_SUB2
+
+CTRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+CTRMM_L2x2_END:
+
+CTRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble CTRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L2x1_SUB4
+
+CTRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble CTRMM_L2x1_LOOP_END
+
+ .align 5
+
+CTRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt CTRMM_L2x1_LOOP
+
+CTRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b CTRMM_L2x1_SUB1
+
+CTRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b CTRMM_L2x1_SUB1
+
+CTRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L2x1_SAVE
+ b CTRMM_L2x1_SUB2
+
+CTRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L2x1_SAVE
+
+CTRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L2x1_SUB2
+
+CTRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+CTRMM_L2x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+CTRMM_L2_END:
+
+ b CTRMM_L1_BEGIN
+
+L999_H2:
+
+ b L999
+
+CTRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble CTRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble CTRMM_L1x8_END
+
+CTRMM_L1x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L1x8_SUB4
+
+CTRMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble CTRMM_L1x8_LOOP_END
+
+ .align 5
+
+CTRMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt CTRMM_L1x8_LOOP
+
+CTRMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b CTRMM_L1x8_SUB1
+
+CTRMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b CTRMM_L1x8_SUB1
+
+CTRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L1x8_SAVE
+ b CTRMM_L1x8_SUB2
+
+CTRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L1x8_SAVE
+
+CTRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L1x8_SUB2
+
+CTRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt CTRMM_L1x8_BEGIN
+
+CTRMM_L1x8_END:
+
+CTRMM_L1x4_BEGIN:
+ andi. T2, M, 7
+ ble CTRMM_L1x1_END
+
+ andi. T1, M, 4
+ ble CTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L1x4_SUB4
+
+CTRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble CTRMM_L1x4_LOOP_END
+
+ .align 5
+
+CTRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt CTRMM_L1x4_LOOP
+
+CTRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b CTRMM_L1x4_SUB1
+
+CTRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b CTRMM_L1x4_SUB1
+
+CTRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L1x4_SAVE
+ b CTRMM_L1x4_SUB2
+
+CTRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L1x4_SAVE
+
+CTRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L1x4_SUB2
+
+CTRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+CTRMM_L1x4_END:
+
+CTRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble CTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L1x2_SUB4
+
+CTRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble CTRMM_L1x2_LOOP_END
+
+ .align 5
+
+CTRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt CTRMM_L1x2_LOOP
+
+CTRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b CTRMM_L1x2_SUB1
+
+CTRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b CTRMM_L1x2_SUB1
+
+CTRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L1x2_SAVE
+ b CTRMM_L1x2_SUB2
+
+CTRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L1x2_SAVE
+
+CTRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L1x2_SUB2
+
+CTRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+CTRMM_L1x2_END:
+
+CTRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble CTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble CTRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble CTRMM_L1x1_SUB4
+
+CTRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble CTRMM_L1x1_LOOP_END
+
+ .align 5
+
+CTRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt CTRMM_L1x1_LOOP
+
+CTRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b CTRMM_L1x1_SUB1
+
+CTRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b CTRMM_L1x1_SUB1
+
+CTRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble CTRMM_L1x1_SAVE
+ b CTRMM_L1x1_SUB2
+
+CTRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble CTRMM_L1x1_SAVE
+
+CTRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt CTRMM_L1x1_SUB2
+
+CTRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+CTRMM_L1x1_END:
+
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+CTRMM_L1_END:
diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S
new file mode 100644
index 000000000..48a21252c
--- /dev/null
+++ b/kernel/power/ctrmm_macros_8x4_power8.S
@@ -0,0 +1,6794 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xsadddp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvsubsp
+ #define XVFADD_I1 xvaddsp
+ #define XVFADD_I2 xvaddsp
+
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xsadddp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvaddsp
+ #define XVFADD_I1 xvsubsp
+ #define XVFADD_I2 xvaddsp
+
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xssubdp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvaddsp
+ #define XVFADD_I1 xvaddsp
+ #define XVFADD_I2 xvsubsp
+
+#else // CC || CR || RC || RR
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xssubdp
+ #define XVFADD_R1 xvaddsp
+ #define XVFADD_R2 xvsubsp
+ #define XVFADD_I1 xvsubsp
+ #define XVFADD_I2 xvsubsp
+
+#endif
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
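+//
+// vs0-vs3 hold eight complex elements of A (real/imaginary interleaved),
+// vs8-vs15 hold the real and imaginary parts of the four B values splatted
+// across all lanes, and vs32-vs63 are the accumulators. The _1/_2 kernel
+// variants ping-pong between two register sets (vs0-vs3/vs8-vs15 and
+// vs4-vs7/vs16-vs23) so that the loads for the next k iteration overlap
+// with the multiply-adds of the current one.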
+
+.macro LOAD4x8_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ lxvw4x vs6, o32, AO // load a4, a5
+
+ lxvw4x vs7, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs20, vs25, 0
+ xxspltw vs21, vs25, 1
+ xxspltw vs22, vs25, 2
+ xxspltw vs23, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+ xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+ xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ lxvw4x vs6, o32, AO // load a4, a5
+
+ lxvw4x vs7, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs20, vs25, 0
+ xxspltw vs21, vs25, 1
+ xxspltw vs22, vs25, 2
+ xxspltw vs23, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmaddasp	vs40,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+	xvmaddasp	vs48,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs51,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+	xvmaddasp	vs52,	vs2,	vs12		// a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+	xvmaddasp	vs53,	vs2,	vs13		// a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+	xvmaddasp	vs54,	vs3,	vs12		// a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+	xvmaddasp	vs55,	vs3,	vs13		// a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+	xvmaddasp	vs56,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs59,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+	xvmaddasp	vs60,	vs2,	vs14		// a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+	xvmaddasp	vs61,	vs2,	vs15		// a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+	xvmaddasp	vs62,	vs3,	vs14		// a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+	xvmaddasp	vs63,	vs3,	vs15		// a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs5,	vs16		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs5,	vs17		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs6,	vs16		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs6,	vs17		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs7,	vs16		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs7,	vs17		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmaddasp	vs40,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs5,	vs18		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs5,	vs19		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs6,	vs18		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs6,	vs19		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs7,	vs18		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs7,	vs19		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+	xvmaddasp	vs48,	vs4,	vs20		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49,	vs4,	vs21		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50,	vs5,	vs20		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs51,	vs5,	vs21		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+	xvmaddasp	vs52,	vs6,	vs20		// a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+	xvmaddasp	vs53,	vs6,	vs21		// a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+	xvmaddasp	vs54,	vs7,	vs20		// a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+	xvmaddasp	vs55,	vs7,	vs21		// a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+	xvmaddasp	vs56,	vs4,	vs22		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57,	vs4,	vs23		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58,	vs5,	vs22		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs59,	vs5,	vs23		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+	xvmaddasp	vs60,	vs6,	vs22		// a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+	xvmaddasp	vs61,	vs6,	vs23		// a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+	xvmaddasp	vs62,	vs7,	vs22		// a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+	xvmaddasp	vs63,	vs7,	vs23		// a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs5,	vs16		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs5,	vs17		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs6,	vs16		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs6,	vs17		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs7,	vs16		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs7,	vs17		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmaddasp	vs40,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs5,	vs18		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs5,	vs19		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs6,	vs18		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs6,	vs19		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs7,	vs18		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs7,	vs19		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+	xvmaddasp	vs48,	vs4,	vs20		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49,	vs4,	vs21		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50,	vs5,	vs20		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs51,	vs5,	vs21		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+	xvmaddasp	vs52,	vs6,	vs20		// a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+	xvmaddasp	vs53,	vs6,	vs21		// a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+	xvmaddasp	vs54,	vs7,	vs20		// a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+	xvmaddasp	vs55,	vs7,	vs21		// a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+	xvmaddasp	vs56,	vs4,	vs22		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57,	vs4,	vs23		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58,	vs5,	vs22		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs59,	vs5,	vs23		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+	xvmaddasp	vs60,	vs6,	vs22		// a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+	xvmaddasp	vs61,	vs6,	vs23		// a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+	xvmaddasp	vs62,	vs7,	vs22		// a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+	xvmaddasp	vs63,	vs7,	vs23		// a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmulsp		vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp		vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmulsp		vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmulsp		vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmulsp		vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmulsp		vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmulsp		vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmulsp		vs40,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp		vs41,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp		vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmulsp		vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmulsp		vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmulsp		vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmulsp		vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmulsp		vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+	xvmulsp		vs48,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp		vs49,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmulsp		vs50,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmulsp		vs51,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+	xvmulsp		vs52,	vs2,	vs12		// a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+	xvmulsp		vs53,	vs2,	vs13		// a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+	xvmulsp		vs54,	vs3,	vs12		// a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+	xvmulsp		vs55,	vs3,	vs13		// a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+	xvmulsp		vs56,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp		vs57,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp		vs58,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmulsp		vs59,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+	xvmulsp		vs60,	vs2,	vs14		// a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+	xvmulsp		vs61,	vs2,	vs15		// a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+	xvmulsp		vs62,	vs3,	vs14		// a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+	xvmulsp		vs63,	vs3,	vs15		// a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmaddasp	vs40,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+	xvmaddasp	vs48,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs51,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+	xvmaddasp	vs52,	vs2,	vs12		// a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+	xvmaddasp	vs53,	vs2,	vs13		// a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+	xvmaddasp	vs54,	vs3,	vs12		// a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+	xvmaddasp	vs55,	vs3,	vs13		// a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+	xvmaddasp	vs56,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs59,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+	xvmaddasp	vs60,	vs2,	vs14		// a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+	xvmaddasp	vs61,	vs2,	vs15		// a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+	xvmaddasp	vs62,	vs3,	vs14		// a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+	xvmaddasp	vs63,	vs3,	vs15		// a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
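+// SAVE4x8 below reduces each accumulator pair into one complex result r per C
+// element (the XVFADD_R*/XVFADD_I* macros, defined earlier in this file, pick
+// the sign pattern for the conjugation variant being built) and then applies
+// the complex alpha before adding into C.  In scalar terms, roughly:
+//   c_r += r_r*alpha_r - r_i*alpha_i
+//   c_i += r_r*alpha_i + r_i*alpha_r
+// For TRMMKERNEL builds the C tile is not loaded first, so the result simply
+// overwrites C.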
+.macro SAVE4x8
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs40, 0
+ xxspltw vs9, vs40, 1
+ xxspltw vs10, vs40, 2
+ xxspltw vs11, vs40, 3
+
+
+ xxspltw vs12, vs41, 0
+ xxspltw vs13, vs41, 1
+ xxspltw vs14, vs41, 2
+ xxspltw vs15, vs41, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs42, 0
+ xxspltw vs9, vs42, 1
+ xxspltw vs10, vs42, 2
+ xxspltw vs11, vs42, 3
+
+
+ xxspltw vs12, vs43, 0
+ xxspltw vs13, vs43, 1
+ xxspltw vs14, vs43, 2
+ xxspltw vs15, vs43, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs44, 0
+ xxspltw vs9, vs44, 1
+ xxspltw vs10, vs44, 2
+ xxspltw vs11, vs44, 3
+
+
+ xxspltw vs12, vs45, 0
+ xxspltw vs13, vs45, 1
+ xxspltw vs14, vs45, 2
+ xxspltw vs15, vs45, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs46, 0
+ xxspltw vs9, vs46, 1
+ xxspltw vs10, vs46, 2
+ xxspltw vs11, vs46, 3
+
+
+ xxspltw vs12, vs47, 0
+ xxspltw vs13, vs47, 1
+ xxspltw vs14, vs47, 2
+ xxspltw vs15, vs47, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs48, 0
+ xxspltw vs9, vs48, 1
+ xxspltw vs10, vs48, 2
+ xxspltw vs11, vs48, 3
+
+
+ xxspltw vs12, vs49, 0
+ xxspltw vs13, vs49, 1
+ xxspltw vs14, vs49, 2
+ xxspltw vs15, vs49, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs50, 0
+ xxspltw vs9, vs50, 1
+ xxspltw vs10, vs50, 2
+ xxspltw vs11, vs50, 3
+
+
+ xxspltw vs12, vs51, 0
+ xxspltw vs13, vs51, 1
+ xxspltw vs14, vs51, 2
+ xxspltw vs15, vs51, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs52, 0
+ xxspltw vs9, vs52, 1
+ xxspltw vs10, vs52, 2
+ xxspltw vs11, vs52, 3
+
+
+ xxspltw vs12, vs53, 0
+ xxspltw vs13, vs53, 1
+ xxspltw vs14, vs53, 2
+ xxspltw vs15, vs53, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs54, 0
+ xxspltw vs9, vs54, 1
+ xxspltw vs10, vs54, 2
+ xxspltw vs11, vs54, 3
+
+
+ xxspltw vs12, vs55, 0
+ xxspltw vs13, vs55, 1
+ xxspltw vs14, vs55, 2
+ xxspltw vs15, vs55, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs56, 0
+ xxspltw vs9, vs56, 1
+ xxspltw vs10, vs56, 2
+ xxspltw vs11, vs56, 3
+
+
+ xxspltw vs12, vs57, 0
+ xxspltw vs13, vs57, 1
+ xxspltw vs14, vs57, 2
+ xxspltw vs15, vs57, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs58, 0
+ xxspltw vs9, vs58, 1
+ xxspltw vs10, vs58, 2
+ xxspltw vs11, vs58, 3
+
+
+ xxspltw vs12, vs59, 0
+ xxspltw vs13, vs59, 1
+ xxspltw vs14, vs59, 2
+ xxspltw vs15, vs59, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs60, 0
+ xxspltw vs9, vs60, 1
+ xxspltw vs10, vs60, 2
+ xxspltw vs11, vs60, 3
+
+
+ xxspltw vs12, vs61, 0
+ xxspltw vs13, vs61, 1
+ xxspltw vs14, vs61, 2
+ xxspltw vs15, vs61, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs62, 0
+ xxspltw vs9, vs62, 1
+ xxspltw vs10, vs62, 2
+ xxspltw vs11, vs62, 3
+
+
+ xxspltw vs12, vs63, 0
+ xxspltw vs13, vs63, 1
+ xxspltw vs14, vs63, 2
+ xxspltw vs15, vs63, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
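+// The N=4, M=4 macros below follow the same scheme as the N=4, M=8 macros
+// above, but each K step loads only vs0/vs1 (a0..a3, four complex elements)
+// and keeps sixteen accumulators (vs32..vs47) instead of thirty-two.
+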
+.macro LOAD4x4_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs20, vs25, 0
+ xxspltw vs21, vs25, 1
+ xxspltw vs22, vs25, 2
+ xxspltw vs23, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmulsp		vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp		vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmulsp		vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+	xvmulsp		vs36,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp		vs37,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp		vs38,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmulsp		vs39,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+	xvmulsp		vs40,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp		vs41,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmulsp		vs42,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmulsp		vs43,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+	xvmulsp		vs44,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp		vs45,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp		vs46,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmulsp		vs47,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs20, vs25, 0
+ xxspltw vs21, vs25, 1
+ xxspltw vs22, vs25, 2
+ xxspltw vs23, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+	xvmaddasp	vs36,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs37,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs38,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs39,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+	xvmaddasp	vs40,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs41,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs42,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs43,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+	xvmaddasp	vs44,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs45,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs46,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs47,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs5,	vs16		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs5,	vs17		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+	xvmaddasp	vs36,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs37,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs38,	vs5,	vs18		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs39,	vs5,	vs19		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+	xvmaddasp	vs40,	vs4,	vs20		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs41,	vs4,	vs21		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs42,	vs5,	vs20		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs43,	vs5,	vs21		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+	xvmaddasp	vs44,	vs4,	vs22		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs45,	vs4,	vs23		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs46,	vs5,	vs22		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs47,	vs5,	vs23		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs5,	vs16		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs5,	vs17		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+	xvmaddasp	vs36,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs37,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs38,	vs5,	vs18		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs39,	vs5,	vs19		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+	xvmaddasp	vs40,	vs4,	vs20		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs41,	vs4,	vs21		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs42,	vs5,	vs20		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs43,	vs5,	vs21		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+	xvmaddasp	vs44,	vs4,	vs22		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs45,	vs4,	vs23		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs46,	vs5,	vs22		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs47,	vs5,	vs23		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmulsp		vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp		vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmulsp		vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+	xvmulsp		vs36,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp		vs37,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp		vs38,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmulsp		vs39,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+	xvmulsp		vs40,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp		vs41,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmulsp		vs42,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmulsp		vs43,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+	xvmulsp		vs44,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp		vs45,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp		vs46,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmulsp		vs47,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs0,	vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+	xvmaddasp	vs36,	vs0,	vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs37,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs38,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs39,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+	xvmaddasp	vs40,	vs0,	vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs41,	vs0,	vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs42,	vs1,	vs12		// a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+	xvmaddasp	vs43,	vs1,	vs13		// a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+	xvmaddasp	vs44,	vs0,	vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs45,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs46,	vs1,	vs14		// a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+	xvmaddasp	vs47,	vs1,	vs15		// a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs40, 0
+ xxspltw vs9, vs40, 1
+ xxspltw vs10, vs40, 2
+ xxspltw vs11, vs40, 3
+
+
+ xxspltw vs12, vs41, 0
+ xxspltw vs13, vs41, 1
+ xxspltw vs14, vs41, 2
+ xxspltw vs15, vs41, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=2 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs42, 0
+ xxspltw vs9, vs42, 1
+ xxspltw vs10, vs42, 2
+ xxspltw vs11, vs42, 3
+
+
+ xxspltw vs12, vs43, 0
+ xxspltw vs13, vs43, 1
+ xxspltw vs14, vs43, 2
+ xxspltw vs15, vs43, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs44, 0
+ xxspltw vs9, vs44, 1
+ xxspltw vs10, vs44, 2
+ xxspltw vs11, vs44, 3
+
+
+ xxspltw vs12, vs45, 0
+ xxspltw vs13, vs45, 1
+ xxspltw vs14, vs45, 2
+ xxspltw vs15, vs45, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=3 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs46, 0
+ xxspltw vs9, vs46, 1
+ xxspltw vs10, vs46, 2
+ xxspltw vs11, vs46, 3
+
+
+ xxspltw vs12, vs47, 0
+ xxspltw vs13, vs47, 1
+ xxspltw vs14, vs47, 2
+ xxspltw vs15, vs47, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
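+
+// Note: the SAVE macros in this file all follow the same per-column pattern:
+// splat each accumulator lane, combine the partial products with the
+// XVFADD_{R,I}{1,2} helpers (presumably defined earlier in this file to add or
+// subtract according to the conjugation variant), scale by alpha, and add the
+// result into C unless TRMMKERNEL is defined. A rough sketch of the update,
+// assuming alpha = alpha_r + i*alpha_i and an accumulated product r = r_r + i*r_i:
+//
+//     c_r += r_r * alpha_r - r_i * alpha_i;
+//     c_i += r_r * alpha_i + r_i * alpha_r;
+//
+// The vector paths do this for two complex elements at a time, packing the four
+// scalar results into one register with xxsldwi before the final xvaddsp and stxvw4x.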
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
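+
+// Note on the macro family below: LOAD4x2_1 primes the first iteration,
+// KERNEL4x2_I1 starts the accumulators vs32-vs39 with xvmulsp, and
+// KERNEL4x2_1/KERNEL4x2_2 form a software-pipelined pair that alternates
+// between the vs0/vs8-vs15 and vs4/vs16-vs23 register sets; KERNEL4x2_E2
+// drains the last pipelined step, and KERNEL4x2_SUBI1/KERNEL4x2_SUB1 handle
+// the K remainder one step at a time. Each lxvw4x of A brings in two
+// interleaved complex singles (a0, a1), and every b component is splatted to
+// all four lanes with xxspltw before the multiply-add.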
+
+.macro LOAD4x2_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs20, vs25, 0
+ xxspltw vs21, vs25, 1
+ xxspltw vs22, vs25, 2
+ xxspltw vs23, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs20, vs25, 0
+ xxspltw vs21, vs25, 1
+ xxspltw vs22, vs25, 2
+ xxspltw vs23, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmaddasp	vs34,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs35,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmaddasp	vs36,	vs4,	vs20		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs37,	vs4,	vs21		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xvmaddasp	vs38,	vs4,	vs22		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs39,	vs4,	vs23		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmaddasp	vs34,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs35,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmaddasp	vs36,	vs4,	vs20		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs37,	vs4,	vs21		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xvmaddasp	vs38,	vs4,	vs22		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs39,	vs4,	vs23		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+ lxvw4x vs25, o16, BO // load b2, b3
+
+ xxspltw vs12, vs25, 0
+ xxspltw vs13, vs25, 1
+ xxspltw vs14, vs25, 2
+ xxspltw vs15, vs25, 3
+
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+ xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+ xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+ xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+ xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
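+
+// Note: with a single complex element of A per step, the M=1 path below works
+// on scalars. lxsspx loads one single-precision value into a VSX scalar
+// register, the partial products are built with xsmuldp/xsmaddadp, and SAVE4x1
+// combines them through the XSFADD_* helpers (assumed, like their vector
+// counterparts, to be defined earlier per conjugation variant) before scaling
+// by alpha_dr/alpha_di and storing the result back with stxsspx.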
+
+.macro LOAD4x1_1
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi T1, T1,8
+
+ lxsspx vs12, o0, T1 // load b2_r
+ lxsspx vs13, o4, T1 // load b2_i
+
+ addi T1, T1,8
+
+ lxsspx vs14, o0, T1 // load b3_r
+ lxsspx vs15, o4, T1 // load b3_i
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1 // load b0_r
+ lxsspx vs17, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs18, o0, T1 // load b1_r
+ lxsspx vs19, o4, T1 // load b1_i
+
+ addi T1, T1,8
+
+ lxsspx vs20, o0, T1 // load b2_r
+ lxsspx vs21, o4, T1 // load b2_i
+
+ addi T1, T1,8
+
+ lxsspx vs22, o0, T1 // load b3_r
+ lxsspx vs23, o4, T1 // load b3_i
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmuldp vs40, vs0, vs12 // a0_r*b2_r
+ xsmuldp vs41, vs1, vs13 // a0_i*b2_i
+ xsmuldp vs42, vs0, vs13 // a0_r*b2_i
+ xsmuldp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmuldp vs44, vs0, vs14 // a0_r*b3_r
+ xsmuldp vs45, vs1, vs15 // a0_i*b3_i
+ xsmuldp vs46, vs0, vs15 // a0_r*b3_i
+ xsmuldp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1 // load b0_r
+ lxsspx vs17, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs18, o0, T1 // load b1_r
+ lxsspx vs19, o4, T1 // load b1_i
+
+ addi T1, T1,8
+
+ lxsspx vs20, o0, T1 // load b2_r
+ lxsspx vs21, o4, T1 // load b2_i
+
+ addi T1, T1,8
+
+ lxsspx vs22, o0, T1 // load b3_r
+ lxsspx vs23, o4, T1 // load b3_i
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmaddadp vs40, vs0, vs12 // a0_r*b2_r
+ xsmaddadp vs41, vs1, vs13 // a0_i*b2_i
+ xsmaddadp vs42, vs0, vs13 // a0_r*b2_i
+ xsmaddadp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmaddadp vs44, vs0, vs14 // a0_r*b3_r
+ xsmaddadp vs45, vs1, vs15 // a0_i*b3_i
+ xsmaddadp vs46, vs0, vs15 // a0_r*b3_i
+ xsmaddadp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi T1, T1,8
+
+ lxsspx vs12, o0, T1 // load b2_r
+ lxsspx vs13, o4, T1 // load b2_i
+
+ addi T1, T1,8
+
+ lxsspx vs14, o0, T1 // load b3_r
+ lxsspx vs15, o4, T1 // load b3_i
+
+ addi BO, BO, 32
+
+
+	xsmaddadp	vs32,	vs4,	vs16		// a0_r*b0_r
+	xsmaddadp	vs33,	vs5,	vs17		// a0_i*b0_i
+	xsmaddadp	vs34,	vs4,	vs17		// a0_r*b0_i
+	xsmaddadp	vs35,	vs5,	vs16		// a0_i*b0_r
+
+	xsmaddadp	vs36,	vs4,	vs18		// a0_r*b1_r
+	xsmaddadp	vs37,	vs5,	vs19		// a0_i*b1_i
+	xsmaddadp	vs38,	vs4,	vs19		// a0_r*b1_i
+	xsmaddadp	vs39,	vs5,	vs18		// a0_i*b1_r
+
+	xsmaddadp	vs40,	vs4,	vs20		// a0_r*b2_r
+	xsmaddadp	vs41,	vs5,	vs21		// a0_i*b2_i
+	xsmaddadp	vs42,	vs4,	vs21		// a0_r*b2_i
+	xsmaddadp	vs43,	vs5,	vs20		// a0_i*b2_r
+
+	xsmaddadp	vs44,	vs4,	vs22		// a0_r*b3_r
+	xsmaddadp	vs45,	vs5,	vs23		// a0_i*b3_i
+	xsmaddadp	vs46,	vs4,	vs23		// a0_r*b3_i
+	xsmaddadp	vs47,	vs5,	vs22		// a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+	xsmaddadp	vs32,	vs4,	vs16		// a0_r*b0_r
+	xsmaddadp	vs33,	vs5,	vs17		// a0_i*b0_i
+	xsmaddadp	vs34,	vs4,	vs17		// a0_r*b0_i
+	xsmaddadp	vs35,	vs5,	vs16		// a0_i*b0_r
+
+	xsmaddadp	vs36,	vs4,	vs18		// a0_r*b1_r
+	xsmaddadp	vs37,	vs5,	vs19		// a0_i*b1_i
+	xsmaddadp	vs38,	vs4,	vs19		// a0_r*b1_i
+	xsmaddadp	vs39,	vs5,	vs18		// a0_i*b1_r
+
+	xsmaddadp	vs40,	vs4,	vs20		// a0_r*b2_r
+	xsmaddadp	vs41,	vs5,	vs21		// a0_i*b2_i
+	xsmaddadp	vs42,	vs4,	vs21		// a0_r*b2_i
+	xsmaddadp	vs43,	vs5,	vs20		// a0_i*b2_r
+
+	xsmaddadp	vs44,	vs4,	vs22		// a0_r*b3_r
+	xsmaddadp	vs45,	vs5,	vs23		// a0_i*b3_i
+	xsmaddadp	vs46,	vs4,	vs23		// a0_r*b3_i
+	xsmaddadp	vs47,	vs5,	vs22		// a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi T1, T1,8
+
+ lxsspx vs12, o0, T1 // load b2_r
+ lxsspx vs13, o4, T1 // load b2_i
+
+ addi T1, T1,8
+
+ lxsspx vs14, o0, T1 // load b3_r
+ lxsspx vs15, o4, T1 // load b3_i
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmuldp vs40, vs0, vs12 // a0_r*b2_r
+ xsmuldp vs41, vs1, vs13 // a0_i*b2_i
+ xsmuldp vs42, vs0, vs13 // a0_r*b2_i
+ xsmuldp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmuldp vs44, vs0, vs14 // a0_r*b3_r
+ xsmuldp vs45, vs1, vs15 // a0_i*b3_i
+ xsmuldp vs46, vs0, vs15 // a0_r*b3_i
+ xsmuldp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi T1, T1,8
+
+ lxsspx vs12, o0, T1 // load b2_r
+ lxsspx vs13, o4, T1 // load b2_i
+
+ addi T1, T1,8
+
+ lxsspx vs14, o0, T1 // load b3_r
+ lxsspx vs15, o4, T1 // load b3_i
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+ xsmaddadp vs40, vs0, vs12 // a0_r*b2_r
+ xsmaddadp vs41, vs1, vs13 // a0_i*b2_i
+ xsmaddadp vs42, vs0, vs13 // a0_r*b2_i
+ xsmaddadp vs43, vs1, vs12 // a0_i*b2_r
+
+ xsmaddadp vs44, vs0, vs14 // a0_r*b3_r
+ xsmaddadp vs45, vs1, vs15 // a0_i*b3_i
+ xsmaddadp vs46, vs0, vs15 // a0_r*b3_i
+ xsmaddadp vs47, vs1, vs14 // a0_i*b3_r
+
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=2
+
+ mr T2, T1
+
+// N=2 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=3
+
+ mr T2, T1
+
+// N=3 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
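+
+// Note: the N=2 kernels below keep the structure of the N=4 ones but read only
+// one 16-byte vector of B per step (the two columns b0 and b1), while each A
+// step covers eight complex singles a0..a7 (64 bytes). Accumulators vs32-vs39
+// hold the b0 column and vs40-vs47 the b1 column.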
+
+.macro LOAD2x8_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ lxvw4x vs6, o32, AO // load a4, a5
+
+ lxvw4x vs7, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmulsp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmulsp	vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmulsp	vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmulsp	vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmulsp	vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp	vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmulsp	vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmulsp	vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmulsp	vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmulsp	vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmulsp	vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ lxvw4x vs6, o32, AO // load a4, a5
+
+ lxvw4x vs7, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs5,	vs16		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs5,	vs17		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs6,	vs16		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs6,	vs17		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs7,	vs16		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs7,	vs17		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmaddasp	vs40,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs5,	vs18		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs5,	vs19		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs6,	vs18		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs6,	vs19		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs7,	vs18		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs7,	vs19		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs5,	vs16		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs5,	vs17		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs6,	vs16		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs6,	vs17		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs7,	vs16		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs7,	vs17		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+	xvmaddasp	vs40,	vs4,	vs18		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41,	vs4,	vs19		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs5,	vs18		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs5,	vs19		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs6,	vs18		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs6,	vs19		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs7,	vs18		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs7,	vs19		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmulsp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmulsp	vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmulsp	vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmulsp	vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmulsp	vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp	vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmulsp	vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmulsp	vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmulsp	vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmulsp	vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmulsp	vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34,	vs1,	vs8		// a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+	xvmaddasp	vs35,	vs1,	vs9		// a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+	xvmaddasp	vs36,	vs2,	vs8		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs37,	vs2,	vs9		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+	xvmaddasp	vs38,	vs3,	vs8		// a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+	xvmaddasp	vs39,	vs3,	vs9		// a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+ xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42,	vs1,	vs10		// a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+	xvmaddasp	vs43,	vs1,	vs11		// a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+	xvmaddasp	vs44,	vs2,	vs10		// a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+	xvmaddasp	vs45,	vs2,	vs11		// a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+	xvmaddasp	vs46,	vs3,	vs10		// a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+	xvmaddasp	vs47,	vs3,	vs11		// a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs40, 0
+ xxspltw vs9, vs40, 1
+ xxspltw vs10, vs40, 2
+ xxspltw vs11, vs40, 3
+
+
+ xxspltw vs12, vs41, 0
+ xxspltw vs13, vs41, 1
+ xxspltw vs14, vs41, 2
+ xxspltw vs15, vs41, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs42, 0
+ xxspltw vs9, vs42, 1
+ xxspltw vs10, vs42, 2
+ xxspltw vs11, vs42, 3
+
+
+ xxspltw vs12, vs43, 0
+ xxspltw vs13, vs43, 1
+ xxspltw vs14, vs43, 2
+ xxspltw vs15, vs43, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs44, 0
+ xxspltw vs9, vs44, 1
+ xxspltw vs10, vs44, 2
+ xxspltw vs11, vs44, 3
+
+
+ xxspltw vs12, vs45, 0
+ xxspltw vs13, vs45, 1
+ xxspltw vs14, vs45, 2
+ xxspltw vs15, vs45, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs46, 0
+ xxspltw vs9, vs46, 1
+ xxspltw vs10, vs46, 2
+ xxspltw vs11, vs46, 3
+
+
+ xxspltw vs12, vs47, 0
+ xxspltw vs13, vs47, 1
+ xxspltw vs14, vs47, 2
+ xxspltw vs15, vs47, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
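
The SAVE* macros in this kernel all repeat one per-element pattern: the accumulated real and imaginary partial sums are combined with the XVFADD_* helpers, scaled by alpha, and added to the value loaded from C (when TRMMKERNEL is defined the old C value is replaced by zero instead). A minimal scalar sketch of that update in C, with illustrative names only:

    typedef struct { float r, i; } cfloat_t;   /* hypothetical helper type */

    /* acc_r/acc_i: combined partial sums for one C entry (the XVFADD_* result),
     * alpha_r/alpha_i: the scaling factor held in alpha_sr/alpha_si above.    */
    static cfloat_t save_update(cfloat_t c_old, float acc_r, float acc_i,
                                float alpha_r, float alpha_i)
    {
        cfloat_t out;
        float res_r = acc_r * alpha_r - acc_i * alpha_i;  /* xvmulsp + xvsubsp */
        float res_i = acc_r * alpha_i + acc_i * alpha_r;  /* xvmulsp + xvaddsp */
        out.r = c_old.r + res_r;  /* c_old is zero in the TRMMKERNEL branch */
        out.i = c_old.i + res_i;
        return out;
    }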
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+ xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
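
The LOAD*/KERNEL*_I1/_1/_2/_E2 macros form a two-stage software pipeline: _1 issues multiply-adds on the register set loaded previously (vs0/vs8...) while fetching the next set into vs4/vs16..., _2 does the opposite, _I1 starts the chain with plain multiplies, and _E2 drains it without loading; the SUBI1/SUB1 variants are the unpipelined single-step forms. A rough C sketch of that ping-pong, assuming a single scalar accumulator and an even trip count of at least 2 (names are hypothetical):

    static float kernel_pingpong(int k, const float *a, const float *b)
    {
        float acc;
        float a_cur = a[0], b_cur = b[0];        /* LOAD..._1: prime set 0     */
        float a_nxt = a[1], b_nxt = b[1];        /* KERNEL..._I1: load set 1,  */
        acc = a_cur * b_cur;                     /*   first product (xvmulsp)  */

        for (int i = 2; i + 1 < k; i += 2) {
            a_cur = a[i];     b_cur = b[i];      /* KERNEL..._2: reload set 0, */
            acc += a_nxt * b_nxt;                /*   madd set 1 (xvmaddasp)   */
            a_nxt = a[i + 1]; b_nxt = b[i + 1];  /* KERNEL..._1: reload set 1, */
            acc += a_cur * b_cur;                /*   madd set 0               */
        }
        return acc + a_nxt * b_nxt;              /* KERNEL..._E2: drain set 1  */
    }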
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=1 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+ xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+ xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1 // load b0_r
+ lxsspx vs17, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs18, o0, T1 // load b1_r
+ lxsspx vs19, o4, T1 // load b1_i
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1 // load b0_r
+ lxsspx vs17, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs18, o0, T1 // load b1_r
+ lxsspx vs19, o4, T1 // load b1_i
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+ xsmaddadp vs36, vs4, vs18 // a4_r*b1_r
+ xsmaddadp vs37, vs5, vs19 // a4_i*b1_i
+ xsmaddadp vs38, vs4, vs19 // a4_r*b1_i
+ xsmaddadp vs39, vs5, vs18 // a4_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+ xsmaddadp vs36, vs4, vs18 // a4_r*b1_r
+ xsmaddadp vs37, vs5, vs19 // a4_i*b1_i
+ xsmaddadp vs38, vs4, vs19 // a4_r*b1_i
+ xsmaddadp vs39, vs5, vs18 // a4_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmuldp vs36, vs0, vs10 // a0_r*b1_r
+ xsmuldp vs37, vs1, vs11 // a0_i*b1_i
+ xsmuldp vs38, vs0, vs11 // a0_r*b1_i
+ xsmuldp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi T1, T1,8
+
+ lxsspx vs10, o0, T1 // load b1_r
+ lxsspx vs11, o4, T1 // load b1_i
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+ xsmaddadp vs36, vs0, vs10 // a0_r*b1_r
+ xsmaddadp vs37, vs1, vs11 // a0_i*b1_i
+ xsmaddadp vs38, vs0, vs11 // a0_r*b1_i
+ xsmaddadp vs39, vs1, vs10 // a0_i*b1_r
+
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+
+// N=1
+
+ mr T2, T1
+
+// N=1 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
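
For the M=1 columns the kernel works on scalars: the four partial products a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r are kept in separate registers (vs32..vs35, and vs36..vs39 for the second column) and only combined in the SAVE step via the XSFADD_* helpers, whose signs depend on the conjugation variant. A hedged C sketch of one column, assuming the plain (non-conjugated) case and a contiguous complex b column:

    static void cmac_scalar(int k, const float *a, const float *b,
                            float *c_r, float *c_i,
                            float alpha_r, float alpha_i)
    {
        float rr = 0.0f, ii = 0.0f, ri = 0.0f, ir = 0.0f;

        for (int j = 0; j < k; j++) {         /* KERNEL2x1_SUB1 equivalent */
            rr += a[2*j]     * b[2*j];        /* a_r*b_r -> vs32           */
            ii += a[2*j + 1] * b[2*j + 1];    /* a_i*b_i -> vs33           */
            ri += a[2*j]     * b[2*j + 1];    /* a_r*b_i -> vs34           */
            ir += a[2*j + 1] * b[2*j];        /* a_i*b_r -> vs35           */
        }

        float acc_r = rr - ii;                /* XSFADD_R1 / XSFADD_R2     */
        float acc_i = ir + ri;                /* XSFADD_I1 / XSFADD_I2     */

        *c_r += acc_r * alpha_r - acc_i * alpha_i;
        *c_i += acc_r * alpha_i + acc_i * alpha_r;
    }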
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ lxvw4x vs6, o32, AO // load a4, a5
+
+ lxvw4x vs7, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+ lxvw4x vs6, o32, AO // load a4, a5
+
+ lxvw4x vs7, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+ lxvw4x vs2, o32, AO // load a4, a5
+
+ lxvw4x vs3, o48, AO // load a6, a7
+
+
+ addi AO, AO, 64
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=4
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+ xxspltw vs10, vs36, 2
+ xxspltw vs11, vs36, 3
+
+
+ xxspltw vs12, vs37, 0
+ xxspltw vs13, vs37, 1
+ xxspltw vs14, vs37, 2
+ xxspltw vs15, vs37, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=6
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs38, 0
+ xxspltw vs9, vs38, 1
+ xxspltw vs10, vs38, 2
+ xxspltw vs11, vs38, 3
+
+
+ xxspltw vs12, vs39, 0
+ xxspltw vs13, vs39, 1
+ xxspltw vs14, vs39, 2
+ xxspltw vs15, vs39, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+ lxvw4x vs5, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+ lxvw4x vs1, o16, AO // load a2, a3
+
+
+ addi AO, AO, 32
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+ xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+
+// N=0 M=2
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs34, 0
+ xxspltw vs9, vs34, 1
+ xxspltw vs10, vs34, 2
+ xxspltw vs11, vs34, 3
+
+
+ xxspltw vs12, vs35, 0
+ xxspltw vs13, vs35, 1
+ xxspltw vs14, vs35, 2
+ xxspltw vs15, vs35, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+ lxvw4x vs4, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs16, vs24, 0
+ xxspltw vs17, vs24, 1
+ xxspltw vs18, vs24, 2
+ xxspltw vs19, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+ lxvw4x vs0, o0, AO // load a0, a1
+
+
+ addi AO, AO, 16
+
+ lxvw4x vs24, o0, BO // load b0, b1
+
+
+
+ xxspltw vs8, vs24, 0
+ xxspltw vs9, vs24, 1
+ xxspltw vs10, vs24, 2
+ xxspltw vs11, vs24, 3
+
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+ xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+ lxvw4x vs0, o0, T2 // c0, c1
+#else
+ xxlxor vs0, vs0, vs0
+#endif
+
+
+ xxspltw vs8, vs32, 0
+ xxspltw vs9, vs32, 1
+ xxspltw vs10, vs32, 2
+ xxspltw vs11, vs32, 3
+
+
+ xxspltw vs12, vs33, 0
+ xxspltw vs13, vs33, 1
+ xxspltw vs14, vs33, 2
+ xxspltw vs15, vs33, 3
+
+ XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r
+ XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r
+ XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i
+
+ XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r
+ XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i
+ XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r
+
+ xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r
+ xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i
+ xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i
+ xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r
+
+ xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r
+ xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i
+ xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i
+ xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r
+
+ xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i
+ xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r
+
+ xxlxor vs24, vs24, vs24
+ xxsldwi vs20, vs20, vs24, 3 // r0_r
+ xxsldwi vs21, vs21, vs24, 2 // r0_i
+ xxsldwi vs22, vs22, vs24, 1 // r1_r
+ xxsldwi vs23, vs23, vs24, 0 // r1_i
+ xvaddsp vs20, vs20, vs21 // r0_r, r0_i
+ xvaddsp vs22, vs22, vs23 // r1_r, r1_i
+ xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i
+ xvaddsp vs0, vs0, vs1
+
+
+ stxvw4x vs0, o0, T2 // c0, c1
+
+ addi T2, T2, 16
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1 // load b0_r
+ lxsspx vs17, o4, T1 // load b0_i
+
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+ lxsspx vs4, o0, AO // load a0_r
+ lxsspx vs5, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1 // load b0_r
+ lxsspx vs17, o4, T1 // load b0_i
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16 // a4_r*b0_r
+ xsmaddadp vs33, vs5, vs17 // a4_i*b0_i
+ xsmaddadp vs34, vs4, vs17 // a4_r*b0_i
+ xsmaddadp vs35, vs5, vs16 // a4_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs8 // a0_r*b0_r
+ xsmuldp vs33, vs1, vs9 // a0_i*b0_i
+ xsmuldp vs34, vs0, vs9 // a0_r*b0_i
+ xsmuldp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+ lxsspx vs0, o0, AO // load a0_r
+ lxsspx vs1, o4, AO // load a0_i
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1 // load b0_r
+ lxsspx vs9, o4, T1 // load b0_i
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs8 // a0_r*b0_r
+ xsmaddadp vs33, vs1, vs9 // a0_i*b0_i
+ xsmaddadp vs34, vs0, vs9 // a0_r*b0_i
+ xsmaddadp vs35, vs1, vs8 // a0_i*b0_r
+
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+// N=0
+
+ mr T2, T1
+
+// N=0 M=0
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+ lxsspx vs0, o0, T2 // load c0_r
+ lxsspx vs1, o4, T2 // load c0_i
+#else
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+#endif
+
+ XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r
+ XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i
+
+ XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i
+ XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r
+
+ xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r
+ xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i
+ xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i
+ xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r
+
+ xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i
+ xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r
+
+ xsadddp vs0, vs0, vs20
+ xsadddp vs1, vs1, vs21
+
+
+ stxsspx vs0, o0, T2 // store c0_r
+ stxsspx vs1, o4, T2 // store c0_i
+
+ addi T2, T2, 8
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c
new file mode 100644
index 000000000..77f5345ba
--- /dev/null
+++ b/kernel/power/dasum.c
@@ -0,0 +1,144 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#if defined(POWER8)
+#include "dasum_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+{
+
+ BLASLONG i=0;
+ FLOAT *x = x1;
+ FLOAT temp0, temp1, temp2, temp3;
+ FLOAT temp4, temp5, temp6, temp7;
+ FLOAT sum0 = 0.0;
+ FLOAT sum1 = 0.0;
+ FLOAT sum2 = 0.0;
+ FLOAT sum3 = 0.0;
+
+ while ( i< n )
+ {
+
+ temp0 = ABS(x[0]);
+ temp1 = ABS(x[1]);
+ temp2 = ABS(x[2]);
+ temp3 = ABS(x[3]);
+ temp4 = ABS(x[4]);
+ temp5 = ABS(x[5]);
+ temp6 = ABS(x[6]);
+ temp7 = ABS(x[7]);
+
+ sum0 += temp0;
+ sum1 += temp1;
+ sum2 += temp2;
+ sum3 += temp3;
+
+ sum0 += temp4;
+ sum1 += temp5;
+ sum2 += temp6;
+ sum3 += temp7;
+
+ x+=8;
+ i+=8;
+
+ }
+
+ svec[0] = sum0+sum1+sum2+sum3;
+ svec[1] = 0.0;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+	FLOAT svec[2] __attribute__ ((aligned (16)));
+ BLASLONG n1;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ if ( inc_x == 1 )
+ {
+
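+		// n & -16 rounds n down to a multiple of 16; the vector kernel handles
+		// these elements and the scalar tail loop below picks up the remainder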
+ n1 = n & -16;
+ if ( n1 > 0 )
+ {
+
+ dasum_kernel_16(n1, x, svec);
+ sumf = svec[0] + svec[1];
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ sumf += ABS(x[i]);
+ i++;
+ }
+
+ }
+ else
+ {
+
+ n *= inc_x;
+ while(i < n)
+ {
+ sumf += ABS(x[i]);
+ i += inc_x;
+ }
+
+ }
+ return(sumf);
+}
+
+
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
new file mode 100644
index 000000000..cc38c4f7d
--- /dev/null
+++ b/kernel/power/dasum_microk_power8.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
+static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
+
+static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ BLASLONG pre = 384;
+
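+	// register plan: vs32..vs39 hold eight partial sums, vs40..vs47 hold the 16
+	// doubles currently in flight, and dcbt prefetches 384 bytes ahead of %2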
+ __asm__ __volatile__
+ (
+
+ "dcbt %2 , %4 \n\t"
+
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2 , %4 \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+
+ "xvabsdp 52, 44 \n\t"
+ "xvabsdp 53, 45 \n\t"
+
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "xvabsdp 54, 46 \n\t"
+ "xvabsdp 55, 47 \n\t"
+
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvadddp 36, 36, 52 \n\t"
+ "xvadddp 37, 37, 53 \n\t"
+ "addic. %0 , %0 , -16 \n\t"
+ "xvadddp 38, 38, 54 \n\t"
+ "xvadddp 39, 39, 55 \n\t"
+
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+ "xvabsdp 52, 44 \n\t"
+ "xvabsdp 53, 45 \n\t"
+ "xvabsdp 54, 46 \n\t"
+ "xvabsdp 55, 47 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "xvadddp 36, 36, 52 \n\t"
+ "xvadddp 37, 37, 53 \n\t"
+ "xvadddp 38, 38, 54 \n\t"
+ "xvadddp 39, 39, 55 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+
+ "stxvd2x 32, 0, %3 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (svec), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c
new file mode 100644
index 000000000..4365bd88d
--- /dev/null
+++ b/kernel/power/daxpy.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+
+#if defined(POWER8)
+#include "daxpy_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG register i = 0;
+ FLOAT a = *alpha;
+
+ while(i < n)
+ {
+ y[i] += a * x[i];
+ y[i+1] += a * x[i+1];
+ y[i+2] += a * x[i+2];
+ y[i+3] += a * x[i+3];
+ y[i+4] += a * x[i+4];
+ y[i+5] += a * x[i+5];
+ y[i+6] += a * x[i+6];
+ y[i+7] += a * x[i+7];
+ i+=8 ;
+
+ }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT a2[4];
+ a2[0]=da;
+ a2[1]=da;
+ a2[2]=da;
+ a2[3]=da;
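+	// da is replicated into a2[] so the micro-kernel can load the scalar from a
+	// fixed offset and splat it into a vector register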
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ BLASLONG n1 = n & -16;
+
+ if ( n1 )
+ daxpy_kernel_8(n1, x, y , a2 );
+
+ i = n1;
+ while(i < n)
+ {
+
+ y[i] += da * x[i] ;
+ i++ ;
+
+ }
+ return(0);
+
+
+ }
+
+ BLASLONG n1 = n & -4;
+
+ while(i < n1)
+ {
+
+ FLOAT m1 = da * x[ix] ;
+ FLOAT m2 = da * x[ix+inc_x] ;
+ FLOAT m3 = da * x[ix+2*inc_x] ;
+ FLOAT m4 = da * x[ix+3*inc_x] ;
+
+ y[iy] += m1 ;
+ y[iy+inc_y] += m2 ;
+ y[iy+2*inc_y] += m3 ;
+ y[iy+3*inc_y] += m4 ;
+
+ ix += inc_x*4 ;
+ iy += inc_y*4 ;
+ i+=4 ;
+
+ }
+
+ while(i < n)
+ {
+
+ y[iy] += da * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c
new file mode 100644
index 000000000..bb3f73aca
--- /dev/null
+++ b/kernel/power/daxpy_microk_power8.c
@@ -0,0 +1,201 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+#define HAVE_KERNEL_8 1
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *y2=y+1;
+ BLASLONG pre = 384;
+
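+	// y is read through %3 (y1) and written through %8 (y2, biased by one element
+	// and pulled back 8 bytes in the asm); separate load and store pointers let the
+	// unrolled loop interleave new loads with the stores of the previous block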
+ __asm__ __volatile__
+ (
+
+ "lxsdx 33, %5, %4 \n\t"
+ "xxspltd 32, 33, 0 \n\t"
+ "addi %8, %8, -8 \n\t"
+
+ "dcbt %2, %9 \n\t"
+ "dcbt %3, %9 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t"
+ "lxvd2x 45, %5, %2 \n\t"
+ "lxvd2x 46, %6, %2 \n\t"
+ "lxvd2x 47, %7, %2 \n\t"
+
+ "lxvd2x 52, 0, %3 \n\t"
+ "lxvd2x 53, %5, %3 \n\t"
+ "lxvd2x 54, %6, %3 \n\t"
+ "lxvd2x 55, %7, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %9 \n\t"
+ "dcbt %3, %9 \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t"
+ "xvmaddadp 49, 41, 32 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+
+ "stxvd2x 48, 0, %8 \n\t"
+ "stxvd2x 49, %5, %8 \n\t"
+
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "stxvd2x 50, %6, %8 \n\t"
+ "stxvd2x 51, %7, %8 \n\t"
+
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %8, %8, 64 \n\t"
+
+ "xvmaddadp 52, 44, 32 \n\t"
+ "addi %3, %3, 64 \n\t"
+ "xvmaddadp 53, 45, 32 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t"
+ "lxvd2x 45, %5, %2 \n\t"
+
+ "stxvd2x 52, 0, %8 \n\t"
+ "stxvd2x 53, %5, %8 \n\t"
+
+ "xvmaddadp 54, 46, 32 \n\t"
+ "xvmaddadp 55, 47, 32 \n\t"
+
+ "lxvd2x 46, %6, %2 \n\t"
+ "lxvd2x 47, %7, %2 \n\t"
+
+ "stxvd2x 54, %6, %8 \n\t"
+ "stxvd2x 55, %7, %8 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %8, %8, 64 \n\t"
+
+ "lxvd2x 52, 0, %3 \n\t"
+ "lxvd2x 53, %5, %3 \n\t"
+ "lxvd2x 54, %6, %3 \n\t"
+ "lxvd2x 55, %7, %3 \n\t"
+
+ "addi %3, %3, 64 \n\t"
+
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+
+ "xvmaddadp 48, 40, 32 \n\t"
+ "xvmaddadp 49, 41, 32 \n\t"
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+
+ "xvmaddadp 52, 44, 32 \n\t"
+ "xvmaddadp 53, 45, 32 \n\t"
+ "xvmaddadp 54, 46, 32 \n\t"
+ "xvmaddadp 55, 47, 32 \n\t"
+
+ "stxvd2x 48, 0, %8 \n\t"
+ "stxvd2x 49, %5, %8 \n\t"
+ "stxvd2x 50, %6, %8 \n\t"
+ "stxvd2x 51, %7, %8 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+
+ "stxvd2x 52, 0, %8 \n\t"
+ "stxvd2x 53, %5, %8 \n\t"
+ "stxvd2x 54, %6, %8 \n\t"
+ "stxvd2x 55, %7, %8 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (y1), // 3
+ "r" (alpha), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (y2), // 8
+ "r" (pre) // 9
+ : "cr0", "%0", "%2" , "%3", "%8", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c
new file mode 100644
index 000000000..059c0e5a9
--- /dev/null
+++ b/kernel/power/dcopy.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "dcopy_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+	while ( i<n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+		i += 8;
+
+	}
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		BLASLONG n1 = n & -32;
+		if ( n1 > 0 )
+ {
+ dcopy_kernel_32(n1, x, y);
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ y[i] = x[i] ;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ y[iy] = x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
new file mode 100644
index 000000000..04f7db556
--- /dev/null
+++ b/kernel/power/dcopy_microk_power8.c
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
+
+static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
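+	// alpha and pre are placeholders here: operands %3 and %4 are not referenced
+	// by the copy loop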
+
+ __asm__ __volatile__
+ (
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 50, 0, %2 \n\t"
+ "lxvd2x 51, %5, %2 \n\t"
+ "lxvd2x 52, %6, %2 \n\t"
+ "lxvd2x 53, %7, %2 \n\t"
+ "lxvd2x 54, %8, %2 \n\t"
+ "lxvd2x 55, %9, %2 \n\t"
+ "lxvd2x 56, %10, %2 \n\t"
+ "lxvd2x 57, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "stxvd2x 40, 0, %1 \n\t"
+ "stxvd2x 41, %5, %1 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %1 \n\t"
+ "stxvd2x 43, %7, %1 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %1 \n\t"
+ "stxvd2x 45, %9, %1 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %1 \n\t"
+ "stxvd2x 47, %11, %1 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvd2x 50, 0, %1 \n\t"
+ "stxvd2x 51, %5, %1 \n\t"
+ "lxvd2x 50, 0, %2 \n\t"
+ "lxvd2x 51, %5, %2 \n\t"
+ "stxvd2x 52, %6, %1 \n\t"
+ "stxvd2x 53, %7, %1 \n\t"
+ "lxvd2x 52, %6, %2 \n\t"
+ "lxvd2x 53, %7, %2 \n\t"
+ "stxvd2x 54, %8, %1 \n\t"
+ "stxvd2x 55, %9, %1 \n\t"
+ "lxvd2x 54, %8, %2 \n\t"
+ "lxvd2x 55, %9, %2 \n\t"
+ "stxvd2x 56, %10, %1 \n\t"
+ "stxvd2x 57, %11, %1 \n\t"
+ "lxvd2x 56, %10, %2 \n\t"
+ "lxvd2x 57, %11, %2 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "stxvd2x 40, 0, %1 \n\t"
+ "stxvd2x 41, %5, %1 \n\t"
+ "stxvd2x 42, %6, %1 \n\t"
+ "stxvd2x 43, %7, %1 \n\t"
+ "stxvd2x 44, %8, %1 \n\t"
+ "stxvd2x 45, %9, %1 \n\t"
+ "stxvd2x 46, %10, %1 \n\t"
+ "stxvd2x 47, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvd2x 50, 0, %1 \n\t"
+ "stxvd2x 51, %5, %1 \n\t"
+ "stxvd2x 52, %6, %1 \n\t"
+ "stxvd2x 53, %7, %1 \n\t"
+ "stxvd2x 54, %8, %1 \n\t"
+ "stxvd2x 55, %9, %1 \n\t"
+ "stxvd2x 56, %10, %1 \n\t"
+ "stxvd2x 57, %11, %1 \n\t"
+
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c
new file mode 100644
index 000000000..cef60a2e5
--- /dev/null
+++ b/kernel/power/ddot.c
@@ -0,0 +1,139 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+
+#if defined(POWER8)
+#include "ddot_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+ BLASLONG register i = 0;
+ FLOAT dot = 0.0;
+
+ while(i < n)
+ {
+ dot += y[i] * x[i]
+ + y[i+1] * x[i+1]
+ + y[i+2] * x[i+2]
+ + y[i+3] * x[i+3]
+ + y[i+4] * x[i+4]
+ + y[i+5] * x[i+5]
+ + y[i+6] * x[i+6]
+ + y[i+7] * x[i+7] ;
+
+ i+=8 ;
+
+ }
+ *d += dot;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ FLOAT dot = 0.0 ;
+
+ if ( n <= 0 ) return(dot);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ BLASLONG n1 = n & -16;
+
+ if ( n1 )
+ ddot_kernel_8(n1, x, y , &dot );
+
+ i = n1;
+ while(i < n)
+ {
+
+ dot += y[i] * x[i] ;
+ i++ ;
+
+ }
+ return(dot);
+
+
+ }
+
+ FLOAT temp1 = 0.0;
+ FLOAT temp2 = 0.0;
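+	// two independent accumulators shorten the floating-point dependency chain in
+	// the strided loop below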
+
+ BLASLONG n1 = n & -4;
+
+ while(i < n1)
+ {
+
+ FLOAT m1 = y[iy] * x[ix] ;
+ FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ;
+
+ FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ;
+ FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ;
+
+ ix += inc_x*4 ;
+ iy += inc_y*4 ;
+
+ temp1 += m1+m3;
+ temp2 += m2+m4;
+
+ i+=4 ;
+
+ }
+
+ while(i < n)
+ {
+
+ temp1 += y[iy] * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ dot = temp1 + temp2;
+ return(dot);
+
+}
+
+
diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c
new file mode 100644
index 000000000..b88049212
--- /dev/null
+++ b/kernel/power/ddot_microk_power8.c
@@ -0,0 +1,178 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+
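+	// vs32..vs39 accumulate eight partial dot products; they are reduced by a tree
+	// of xvadddp plus a final xxswapd/xsadddp horizontal add before the store to dot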
+ __asm__ __volatile__
+ (
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "dcbt %2, %12 \n\t"
+ "dcbt %3, %12 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 52, %8, %3 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 53, %9, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 54, %10, %3 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+ "lxvd2x 55, %11, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %12 \n\t"
+ "dcbt %3, %12 \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "xvmaddadp 36, 44, 52 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 52, %8, %3 \n\t"
+ "xvmaddadp 37, 45, 53 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 53, %9, %3 \n\t"
+ "xvmaddadp 38, 46, 54 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 54, %10, %3 \n\t"
+ "xvmaddadp 39, 47, 55 \n\t"
+
+ "lxvd2x 47, %11, %2 \n\t"
+ "lxvd2x 55, %11, %3 \n\t"
+
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "xvmaddadp 36, 44, 52 \n\t"
+ "xvmaddadp 37, 45, 53 \n\t"
+ "xvmaddadp 38, 46, 54 \n\t"
+ "xvmaddadp 39, 47, 55 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+
+ "xsadddp 32, 32, 33 \n\t"
+
+ "stxsdx 32, 0, %4 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (y1), // 3
+ "r" (dot), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112), // 11
+ "r" (pre) // 12
+ : "cr0", "%0", "%2" , "%3", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c
new file mode 100644
index 000000000..812d09d15
--- /dev/null
+++ b/kernel/power/dgemv_n.c
@@ -0,0 +1,426 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+
+#include "common.h"
+
+
+#if defined(POWER8)
+#include "dgemv_n_microk_power8.c"
+#endif
+
+
+#define NBMAX 4096
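+// m is processed in blocks of at most NBMAX rows; when inc_y != 1 the block of y
+// is accumulated in ybuffer and written back through add_y()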
+
+#ifndef HAVE_KERNEL_4x4
+
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+	FLOAT x[4] __attribute__ ((aligned (16)));
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+
+ for ( i=0; i<4; i++)
+ x[i] = xo[i] * *alpha;
+
+ for ( i=0; i< n; i+=4 )
+ {
+ y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
+ y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
+ y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
+ y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
+ }
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+
+static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1;
+	FLOAT x[4] __attribute__ ((aligned (16)));
+ a0 = ap[0];
+ a1 = ap[1];
+
+ for ( i=0; i<2; i++)
+ x[i] = xo[i] * *alpha;
+
+ for ( i=0; i< n; i+=4 )
+ {
+ y[i] += a0[i]*x[0] + a1[i]*x[1];
+ y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1];
+ y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1];
+ y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1];
+ }
+}
+
+
+#endif
+
+#ifndef HAVE_KERNEL_4x1
+
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0;
+	FLOAT x[4] __attribute__ ((aligned (16)));
+ a0 = ap;
+
+ for ( i=0; i<1; i++)
+ x[i] = xo[i] * *alpha;
+
+ for ( i=0; i< n; i+=4 )
+ {
+ y[i] += a0[i]*x[0];
+ y[i+1] += a0[i+1]*x[0];
+ y[i+2] += a0[i+2]*x[0];
+ y[i+3] += a0[i+3]*x[0];
+ }
+}
+
+
+#endif
+
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+ BLASLONG i;
+ if ( inc_dest != 1 )
+ {
+		for ( i=0; i<n; i++ )
+		{
+			*dest += *src;
+			src++;
+			dest += inc_dest;
+		}
+		return;
+	}
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	FLOAT *a_ptr;
+	FLOAT *x_ptr;
+	FLOAT *y_ptr;
+	FLOAT *ap[4];
+	BLASLONG n1;
+	BLASLONG m1;
+	BLASLONG m2;
+	BLASLONG m3;
+	BLASLONG n2;
+	BLASLONG lda4 = lda << 2;
+	FLOAT xbuffer[8],*ybuffer;
+	FLOAT alpha_r[4] __attribute__ ((aligned (16)));
+
+	alpha_r[0] = alpha;
+
+	if ( m < 1 ) return(0);
+	if ( n < 1 ) return(0);
+
+	ybuffer = buffer;
+
+	n1 = n >> 2 ;
+ n2 = n & 3 ;
+
+ m3 = m & 3 ;
+ m1 = m & -4 ;
+ m2 = (m & (NBMAX-1)) - m3 ;
+
+ y_ptr = y;
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ a_ptr = a;
+ x_ptr = x;
+
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+
+ if ( inc_y != 1 )
+ memset(ybuffer,0,NB*8);
+ else
+ ybuffer = y_ptr;
+
+ if ( inc_x == 1 )
+ {
+
+
+ for( i = 0; i < n1 ; i++)
+ {
+ dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ x_ptr += 4;
+ }
+
+ if ( n2 & 2 )
+ {
+ dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r);
+ a_ptr += lda*2;
+ x_ptr += 2;
+ }
+
+
+ if ( n2 & 1 )
+ {
+ dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r);
+ a_ptr += lda;
+ x_ptr += 1;
+
+ }
+
+
+ }
+ else
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ xbuffer[0] = x_ptr[0];
+ x_ptr += inc_x;
+ xbuffer[1] = x_ptr[0];
+ x_ptr += inc_x;
+ xbuffer[2] = x_ptr[0];
+ x_ptr += inc_x;
+ xbuffer[3] = x_ptr[0];
+ x_ptr += inc_x;
+ dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ }
+
+ for( i = 0; i < n2 ; i++)
+ {
+ xbuffer[0] = x_ptr[0];
+ x_ptr += inc_x;
+ dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r);
+ a_ptr += lda;
+
+ }
+
+ }
+
+ a += NB;
+ if ( inc_y != 1 )
+ {
+ add_y(NB,ybuffer,y_ptr,inc_y);
+ y_ptr += NB * inc_y;
+ }
+ else
+ y_ptr += NB ;
+
+ }
+
+ if ( m3 == 0 ) return(0);
+
+ if ( m3 == 3 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp0 = 0.0;
+ FLOAT temp1 = 0.0;
+ FLOAT temp2 = 0.0;
+ if ( lda == 3 && inc_x ==1 )
+ {
+
+ for( i = 0; i < ( n & -4 ); i+=4 )
+ {
+
+ temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+ temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+
+ temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
+ temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
+ temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
+
+ a_ptr += 12;
+ x_ptr += 4;
+ }
+
+ for( ; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ temp2 += a_ptr[2] * x_ptr[0];
+ a_ptr += 3;
+ x_ptr ++;
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ temp2 += a_ptr[2] * x_ptr[0];
+ a_ptr += lda;
+ x_ptr += inc_x;
+
+
+ }
+
+ }
+ y_ptr[0] += alpha * temp0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha * temp1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha * temp2;
+ return(0);
+ }
+
+
+ if ( m3 == 2 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp0 = 0.0;
+ FLOAT temp1 = 0.0;
+ if ( lda == 2 && inc_x ==1 )
+ {
+
+ for( i = 0; i < (n & -4) ; i+=4 )
+ {
+ temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
+ temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
+ temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+ a_ptr += 8;
+ x_ptr += 4;
+
+ }
+
+
+ for( ; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ a_ptr += 2;
+ x_ptr ++;
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ a_ptr += lda;
+ x_ptr += inc_x;
+
+
+ }
+
+ }
+ y_ptr[0] += alpha * temp0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha * temp1;
+ return(0);
+ }
+
+ if ( m3 == 1 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp = 0.0;
+ if ( lda == 1 && inc_x ==1 )
+ {
+
+ for( i = 0; i < (n & -4); i+=4 )
+ {
+ temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
+
+ }
+
+ for( ; i < n; i++ )
+ {
+ temp += a_ptr[i] * x_ptr[i];
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+ temp += a_ptr[0] * x_ptr[0];
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+ y_ptr[0] += alpha * temp;
+ return(0);
+ }
+
+
+ return(0);
+}
+
+
diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c
new file mode 100644
index 000000000..9eabe555c
--- /dev/null
+++ b/kernel/power/dgemv_n_microk_power8.c
@@ -0,0 +1,301 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i=n;
+ BLASLONG o8 = 8;
+ BLASLONG o16 = 16;
+ BLASLONG o24 = 24;
+ BLASLONG pre = 384;
+
+ FLOAT *a0,*a1,*a2,*a3;
+ FLOAT *y1=y+1;
+	FLOAT x[4] __attribute__ ((aligned (16)));
+ a0 = ap[0]+1;
+ a1 = ap[1]+1;
+ a2 = ap[2]+1;
+ a3 = ap[3]+1;
+
+ x[0]=xo[0] * *alpha;
+ x[1]=xo[1] * *alpha;
+ x[2]=xo[2] * *alpha;
+ x[3]=xo[3] * *alpha;
+
+
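+	// x[] already holds alpha*x, so the asm only needs multiply-adds; the loop body
+	// is replicated four times so loads of a0..a3 overlap the updates of y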
+ __asm__ __volatile__
+ (
+ "lxvdsx 32, 0 , %1 \n\t" // x0
+ "lxvdsx 33,%3 , %1 \n\t" // x1
+ "lxvdsx 34,%4 , %1 \n\t" // x2
+ "lxvdsx 35,%5 , %1 \n\t" // x3
+ "addi %2 , %2 , -8 \n\t"
+ "addi %6 , %6 , -8 \n\t"
+ "addi %7 , %7 , -8 \n\t"
+ "addi %8 , %8 , -8 \n\t"
+ "addi %9 , %9 , -8 \n\t"
+
+ "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
+ "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+
+ "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
+ "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+
+ "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
+ "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+
+ "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
+ "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+
+ "addi %6, %6, 32 \n\t"
+ "addi %7, %7, 32 \n\t"
+ "addi %8, %8, 32 \n\t"
+ "addi %9, %9, 32 \n\t"
+
+ "addic. %0 , %0 , -4 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %10 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // y0, y1
+ "lxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "dcbt %6, %10 \n\t"
+ "dcbt %7, %10 \n\t"
+ "dcbt %8, %10 \n\t"
+ "dcbt %9, %10 \n\t"
+
+ "xvmaddadp 40, 48, 32 \n\t"
+ "xvmaddadp 41, 49, 32 \n\t"
+
+ "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
+ "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+
+ "xvmaddadp 40, 50, 33 \n\t"
+ "addi %6, %6, 32 \n\t"
+ "xvmaddadp 41, 51, 33 \n\t"
+
+ "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
+ "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+
+ "xvmaddadp 40, 52, 34 \n\t"
+ "addi %7, %7, 32 \n\t"
+ "xvmaddadp 41, 53, 34 \n\t"
+
+ "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
+ "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+
+ "xvmaddadp 40, 54, 35 \n\t"
+ "addi %8, %8, 32 \n\t"
+ "xvmaddadp 41, 55, 35 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t" // y0, y1
+ "stxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
+ "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+
+ "addi %9, %9, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
+
+ "addic. %0 , %0 , -4 \n\t"
+ "ble 2f \n\t"
+
+
+ "lxvd2x 40, 0, %2 \n\t" // y0, y1
+ "lxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "xvmaddadp 40, 48, 32 \n\t"
+ "xvmaddadp 41, 49, 32 \n\t"
+
+ "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
+ "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+
+ "xvmaddadp 40, 50, 33 \n\t"
+ "addi %6, %6, 32 \n\t"
+ "xvmaddadp 41, 51, 33 \n\t"
+
+ "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
+ "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+
+ "xvmaddadp 40, 52, 34 \n\t"
+ "addi %7, %7, 32 \n\t"
+ "xvmaddadp 41, 53, 34 \n\t"
+
+ "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
+ "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+
+ "xvmaddadp 40, 54, 35 \n\t"
+ "addi %8, %8, 32 \n\t"
+ "xvmaddadp 41, 55, 35 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t" // y0, y1
+ "stxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
+ "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+
+ "addi %9, %9, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
+
+ "addic. %0 , %0 , -4 \n\t"
+ "ble 2f \n\t"
+
+
+ "lxvd2x 40, 0, %2 \n\t" // y0, y1
+ "lxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "xvmaddadp 40, 48, 32 \n\t"
+ "xvmaddadp 41, 49, 32 \n\t"
+
+ "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
+ "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+
+ "xvmaddadp 40, 50, 33 \n\t"
+ "addi %6, %6, 32 \n\t"
+ "xvmaddadp 41, 51, 33 \n\t"
+
+ "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
+ "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+
+ "xvmaddadp 40, 52, 34 \n\t"
+ "addi %7, %7, 32 \n\t"
+ "xvmaddadp 41, 53, 34 \n\t"
+
+ "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
+ "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+
+ "xvmaddadp 40, 54, 35 \n\t"
+ "addi %8, %8, 32 \n\t"
+ "xvmaddadp 41, 55, 35 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t" // y0, y1
+ "stxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
+ "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+
+ "addi %9, %9, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
+
+ "addic. %0 , %0 , -4 \n\t"
+ "ble 2f \n\t"
+
+
+ "lxvd2x 40, 0, %2 \n\t" // y0, y1
+ "lxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "xvmaddadp 40, 48, 32 \n\t"
+ "xvmaddadp 41, 49, 32 \n\t"
+
+ "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
+ "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+
+ "xvmaddadp 40, 50, 33 \n\t"
+ "addi %6, %6, 32 \n\t"
+ "xvmaddadp 41, 51, 33 \n\t"
+
+ "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
+ "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+
+ "xvmaddadp 40, 52, 34 \n\t"
+ "addi %7, %7, 32 \n\t"
+ "xvmaddadp 41, 53, 34 \n\t"
+
+ "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
+ "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+
+ "xvmaddadp 40, 54, 35 \n\t"
+ "addi %8, %8, 32 \n\t"
+ "xvmaddadp 41, 55, 35 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t" // y0, y1
+ "stxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
+ "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+
+ "addi %9, %9, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
+
+ "addic. %0 , %0 , -4 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // y0, y1
+ "lxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ "xvmaddadp 40, 48, 32 \n\t"
+ "xvmaddadp 41, 49, 32 \n\t"
+
+ "xvmaddadp 40, 50, 33 \n\t"
+ "xvmaddadp 41, 51, 33 \n\t"
+
+ "xvmaddadp 40, 52, 34 \n\t"
+ "xvmaddadp 41, 53, 34 \n\t"
+
+ "xvmaddadp 40, 54, 35 \n\t"
+ "xvmaddadp 41, 55, 35 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t" // y0, y1
+ "stxvd2x 41,%4, %2 \n\t" // y2, y3
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x), // 1
+ "r" (y1), // 2
+ "r" (o8), // 3
+ "r" (o16), // 4
+ "r" (o24), // 5
+ "r" (a0), // 6
+ "r" (a1), // 7
+ "r" (a2), // 8
+ "r" (a3), // 9
+ "r" (pre) // 10
+ : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/drot.c b/kernel/power/drot.c
new file mode 100644
index 000000000..c93f69b12
--- /dev/null
+++ b/kernel/power/drot.c
@@ -0,0 +1,167 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+
+#include "common.h"
+
+#pragma GCC optimize "O1"
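+// (presumably to keep the compiler from contracting the mul/add sequences of the
+// scalar fallback into fused multiply-adds; see the note in drot_microk_power8.c)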
+
+#if defined(POWER8)
+#include "drot_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3;
+ FLOAT x00, x01, x02, x03;
+ FLOAT g0, g1, g2, g3;
+ FLOAT y00, y01, y02, y03;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT c1=*c;
+ FLOAT s1=*s;
+
+	while ( i<n )
+	{
+
+		x00 = x1[0];
+		y00 = y1[0];
+		x01 = x1[1];
+		y01 = y1[1];
+		x02 = x1[2];
+		y02 = y1[2];
+		x03 = x1[3];
+		y03 = y1[3];
+
+		f0 = c1*x00 + s1*y00;
+		g0 = c1*y00 - s1*x00;
+		f1 = c1*x01 + s1*y01;
+		g1 = c1*y01 - s1*x01;
+		f2 = c1*x02 + s1*y02;
+		g2 = c1*y02 - s1*x02;
+		f3 = c1*x03 + s1*y03;
+		g3 = c1*y03 - s1*x03;
+
+		x1[0] = f0;
+		y1[0] = g0;
+		x1[1] = f1;
+		y1[1] = g1;
+		x1[2] = f2;
+		y1[2] = g2;
+		x1[3] = f3;
+		y1[3] = g3;
+
+		x1 += 4;
+		y1 += 4;
+		i += 4;
+
+	}
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT c1[4] __attribute__ ((aligned (16)));
+	FLOAT s1[4] __attribute__ ((aligned (16)));
+	FLOAT *x1=x;
+	FLOAT *y1=y;
+	FLOAT temp;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		BLASLONG n1 = n & -16;
+		if ( n1 > 0 )
+ {
+ c1[0]=c;
+ c1[1]=c;
+ c1[2]=c;
+ c1[3]=c;
+ s1[0]=s;
+ s1[1]=s;
+ s1[2]=s;
+ s1[3]=s;
+ drot_kernel_16(n1, x1, y1, c1, s1);
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ temp = c*x[i] + s*y[i] ;
+ y[i] = c*y[i] - s*x[i] ;
+ x[i] = temp ;
+
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ temp = c*x[ix] + s*y[iy] ;
+ y[iy] = c*y[iy] - s*x[ix] ;
+ x[ix] = temp ;
+
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c
new file mode 100644
index 000000000..4444ac7eb
--- /dev/null
+++ b/kernel/power/drot_microk_power8.c
@@ -0,0 +1,211 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+*
+* I don't use fused multiply-add ( precision problems with lapack )
+*
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
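+// (A fused multiply-add skips the intermediate rounding of c*x and s*y, so the
+// rotated values can differ in the last bits from the plain mul/add sequence used
+// here and in the C fallback.)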
+
+#define HAVE_KERNEL_16 1
+
+static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
+
+static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *x2=x+1;
+ FLOAT *y2=y+1;
+
+ __asm__ __volatile__
+ (
+
+ "lxsdx 36 , %5, %3 \n\t" // load c
+ "lxsdx 37 , %5, %4 \n\t" // load s
+ "addi %8 , %8, -8 \n\t"
+ "addi %9 , %9, -8 \n\t"
+
+ "xxspltd 36 , 36, 0 \n\t"
+ "xxspltd 37 , 37, 0 \n\t"
+
+ "lxvd2x 32, 0, %1 \n\t" // load x
+ "lxvd2x 33, %5, %1 \n\t"
+ "lxvd2x 34, %6, %1 \n\t"
+ "lxvd2x 35, %7, %1 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // load y
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "addi %1, %1, 64 \n\t"
+ "addi %2, %2, 64 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "xvmuldp 48, 32, 36 \n\t" // c * x
+ "xvmuldp 49, 33, 36 \n\t"
+ "xvmuldp 50, 34, 36 \n\t"
+ "xvmuldp 51, 35, 36 \n\t"
+
+ "xvmuldp 56, 40, 36 \n\t" // c * y
+ "xvmuldp 57, 41, 36 \n\t"
+ "xvmuldp 58, 42, 36 \n\t"
+ "xvmuldp 59, 43, 36 \n\t"
+
+ "xvmuldp 52, 32, 37 \n\t" // s * x
+ "xvmuldp 53, 33, 37 \n\t"
+
+ "lxvd2x 32, 0, %1 \n\t" // load x
+ "lxvd2x 33, %5, %1 \n\t"
+
+ "xvmuldp 54, 34, 37 \n\t"
+ "xvmuldp 55, 35, 37 \n\t"
+
+ "lxvd2x 34, %6, %1 \n\t"
+ "lxvd2x 35, %7, %1 \n\t"
+
+ "xvmuldp 60, 40, 37 \n\t" // s * y
+ "xvmuldp 61, 41, 37 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // load y
+ "lxvd2x 41, %5, %2 \n\t"
+
+ "xvmuldp 62, 42, 37 \n\t"
+ "xvmuldp 63, 43, 37 \n\t"
+
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "xvadddp 48, 48 , 60 \n\t" // c * x + s * y
+ "xvadddp 49, 49 , 61 \n\t" // c * x + s * y
+
+ "addi %1, %1, 64 \n\t"
+ "addi %2, %2, 64 \n\t"
+
+ "xvadddp 50, 50 , 62 \n\t" // c * x + s * y
+ "xvadddp 51, 51 , 63 \n\t" // c * x + s * y
+
+ "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
+ "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
+ "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
+ "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x
+
+ "stxvd2x 48, 0, %8 \n\t" // store x
+ "stxvd2x 49, %5, %8 \n\t"
+ "stxvd2x 50, %6, %8 \n\t"
+ "stxvd2x 51, %7, %8 \n\t"
+
+ "stxvd2x 56, 0, %9 \n\t" // store y
+ "stxvd2x 57, %5, %9 \n\t"
+ "stxvd2x 58, %6, %9 \n\t"
+ "stxvd2x 59, %7, %9 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+ "addi %9, %9, 64 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmuldp 48, 32, 36 \n\t" // c * x
+ "xvmuldp 49, 33, 36 \n\t"
+ "xvmuldp 50, 34, 36 \n\t"
+ "xvmuldp 51, 35, 36 \n\t"
+
+ "xvmuldp 56, 40, 36 \n\t" // c * y
+ "xvmuldp 57, 41, 36 \n\t"
+ "xvmuldp 58, 42, 36 \n\t"
+ "xvmuldp 59, 43, 36 \n\t"
+
+ "xvmuldp 52, 32, 37 \n\t" // s * x
+ "xvmuldp 53, 33, 37 \n\t"
+ "xvmuldp 54, 34, 37 \n\t"
+ "xvmuldp 55, 35, 37 \n\t"
+
+ "xvmuldp 60, 40, 37 \n\t" // s * y
+ "xvmuldp 61, 41, 37 \n\t"
+ "xvmuldp 62, 42, 37 \n\t"
+ "xvmuldp 63, 43, 37 \n\t"
+
+ "xvadddp 48, 48 , 60 \n\t" // c * x + s * y
+ "xvadddp 49, 49 , 61 \n\t" // c * x + s * y
+ "xvadddp 50, 50 , 62 \n\t" // c * x + s * y
+ "xvadddp 51, 51 , 63 \n\t" // c * x + s * y
+
+ "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
+ "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
+ "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
+ "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x
+
+ "stxvd2x 48, 0, %8 \n\t" // store x
+ "stxvd2x 49, %5, %8 \n\t"
+ "stxvd2x 50, %6, %8 \n\t"
+ "stxvd2x 51, %7, %8 \n\t"
+
+ "stxvd2x 56, 0, %9 \n\t" // store y
+ "stxvd2x 57, %5, %9 \n\t"
+ "stxvd2x 58, %6, %9 \n\t"
+ "stxvd2x 59, %7, %9 \n\t"
+
+
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x1), // 1
+ "r" (y1), // 2
+ "r" (c), // 3
+ "r" (s), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (x2), // 8
+ "r" (y2) // 9
+ : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c
new file mode 100644
index 000000000..c62a56315
--- /dev/null
+++ b/kernel/power/dscal.c
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "dscal_microk_power8.c"
+#endif
+
+#if !defined(HAVE_KERNEL_8)
+
+static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
+{
+
+ BLASLONG i;
+ FLOAT alpha = *da;
+
+	for( i=0; i<n; i+=8 )
+	{
+		x[0] = alpha * x[0];
+		x[1] = alpha * x[1];
+		x[2] = alpha * x[2];
+		x[3] = alpha * x[3];
+		x[4] = alpha * x[4];
+		x[5] = alpha * x[5];
+		x[6] = alpha * x[6];
+		x[7] = alpha * x[7];
+		x+=8;
+	}
+
+}
+
+
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x )
+{
+
+	BLASLONG i;
+
+	for( i=0; i<n; i+=8 )
+	{
+		x[0] = 0.0;
+		x[1] = 0.0;
+		x[2] = 0.0;
+		x[3] = 0.0;
+		x[4] = 0.0;
+		x[5] = 0.0;
+		x[6] = 0.0;
+		x[7] = 0.0;
+		x+=8;
+	}
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0,j=0;
+
+	if ( n <= 0 || inc_x <= 0 )
+		return(0);
+
+	if ( inc_x == 1 )
+	{
+
+	if ( da == 0.0 )
+	{
+
+	BLASLONG n1 = n & -16;
+	if ( n1 > 0 )
+ {
+ FLOAT alpha[2];
+ alpha[0]=da;
+ alpha[1]=da;
+ dscal_kernel_8_zero(n1 , alpha , x);
+ j=n1;
+ }
+
+ while(j < n)
+ {
+
+ x[j]=0.0;
+ j++;
+ }
+
+ }
+ else
+ {
+
+ BLASLONG n1 = n & -16;
+ if ( n1 > 0 )
+ {
+ FLOAT alpha[2];
+ alpha[0]=da;
+ alpha[1]=da;
+ dscal_kernel_8(n1 , alpha , x);
+ j=n1;
+ }
+ while(j < n)
+ {
+
+ x[j] = da * x[j] ;
+ j++;
+ }
+ }
+
+
+ }
+ else
+ {
+
+ if ( da == 0.0 )
+ {
+
+ while(j < n)
+ {
+
+ x[i]=0.0;
+ i += inc_x ;
+ j++;
+ }
+
+ }
+ else
+ {
+
+ while(j < n)
+ {
+
+ x[i] = da * x[i] ;
+ i += inc_x ;
+ j++;
+ }
+ }
+
+ }
+ return 0;
+
+}
+
+
diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c
new file mode 100644
index 000000000..d90c3d80c
--- /dev/null
+++ b/kernel/power/dscal_microk_power8.c
@@ -0,0 +1,219 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_8 1
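+
+/*
+ * dscal_kernel_8: alpha is splatted across a VSX register (lxsdx/xxspltd)
+ * and 16 doubles are scaled per loop iteration with xvmuldp.
+ * dscal_kernel_8_zero: a register is cleared with xxlxor and stored over the
+ * same 16-double span per iteration.
+ */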
+
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *x2=x+1;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "lxsdx 33, 0, %3 \n\t"
+ "xxspltd 32, 33, 0 \n\t"
+ "addi %1, %1, -8 \n\t"
+
+ "dcbt %2, %4 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %4 \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t"
+ "xvmuldp 49, 41, 32 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "xvmuldp 52, 44, 32 \n\t"
+ "xvmuldp 53, 45, 32 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "xvmuldp 54, 46, 32 \n\t"
+ "xvmuldp 55, 47, 32 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "stxvd2x 48, 0, %1 \n\t"
+ "stxvd2x 49, %5, %1 \n\t"
+ "stxvd2x 50, %6, %1 \n\t"
+ "stxvd2x 51, %7, %1 \n\t"
+ "stxvd2x 52, %8, %1 \n\t"
+ "stxvd2x 53, %9, %1 \n\t"
+ "stxvd2x 54, %10, %1 \n\t"
+ "stxvd2x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t"
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp 52, 44, 32 \n\t"
+ "xvmuldp 53, 45, 32 \n\t"
+ "xvmuldp 54, 46, 32 \n\t"
+ "xvmuldp 55, 47, 32 \n\t"
+
+ "stxvd2x 48, 0, %1 \n\t"
+ "stxvd2x 49, %5, %1 \n\t"
+ "stxvd2x 50, %6, %1 \n\t"
+ "stxvd2x 51, %7, %1 \n\t"
+ "stxvd2x 52, %8, %1 \n\t"
+ "stxvd2x 53, %9, %1 \n\t"
+ "stxvd2x 54, %10, %1 \n\t"
+ "stxvd2x 55, %11, %1 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x2), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *x2=x+1;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "xxlxor 32 , 32 , 32 \n\t"
+ "addi %1, %1, -8 \n\t"
+
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "stxvd2x 32, 0, %1 \n\t"
+ "stxvd2x 32, %5, %1 \n\t"
+ "stxvd2x 32, %6, %1 \n\t"
+ "stxvd2x 32, %7, %1 \n\t"
+ "stxvd2x 32, %8, %1 \n\t"
+ "stxvd2x 32, %9, %1 \n\t"
+ "stxvd2x 32, %10, %1 \n\t"
+ "stxvd2x 32, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x2), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c
new file mode 100644
index 000000000..fd2dec9c4
--- /dev/null
+++ b/kernel/power/dswap.c
@@ -0,0 +1,154 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "dswap_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT g0, g1, g2, g3, g4, g5, g6, g7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+	while ( i<n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		g0 = y1[0];
+		g1 = y1[1];
+		g2 = y1[2];
+		g3 = y1[3];
+		g4 = y1[4];
+		g5 = y1[5];
+		g6 = y1[6];
+		g7 = y1[7];
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1[0] = g0;
+		x1[1] = g1;
+		x1[2] = g2;
+		x1[3] = g3;
+		x1[4] = g4;
+		x1[5] = g5;
+		x1[6] = g6;
+		x1[7] = g7;
+
+		x1 += 8;
+		y1 += 8;
+
+		i+=8;
+	}
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT temp;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1 ))
+	{
+
+	BLASLONG n1 = n & -32;
+	if ( n1 > 0 )
+ {
+ dswap_kernel_32(n1, x, y);
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ temp = y[i];
+ y[i] = x[i] ;
+ x[i] = temp;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ temp = y[iy];
+ y[iy] = x[ix] ;
+ x[ix] = temp;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c
new file mode 100644
index 000000000..77747c3b9
--- /dev/null
+++ b/kernel/power/dswap_microk_power8.c
@@ -0,0 +1,180 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
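+
+/*
+ * dswap_kernel_32 swaps 32 doubles per loop iteration: x and y are loaded
+ * into two groups of VSX registers, then the x values are stored to y and
+ * the y values to x.
+ */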
+
+static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *x2=x+1;
+ FLOAT *y2=y+1;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
+ __asm__ __volatile__
+ (
+
+ "addi %3, %3, -8 \n\t"
+ "addi %4, %4, -8 \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 48, 0, %1 \n\t"
+ "lxvd2x 49, %5, %1 \n\t"
+ "lxvd2x 50, %6, %1 \n\t"
+ "lxvd2x 51, %7, %1 \n\t"
+ "lxvd2x 52, %8, %1 \n\t"
+ "lxvd2x 53, %9, %1 \n\t"
+ "lxvd2x 54, %10, %1 \n\t"
+ "lxvd2x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "lxvd2x 56, 0, %1 \n\t"
+ "lxvd2x 57, %5, %1 \n\t"
+ "lxvd2x 58, %6, %1 \n\t"
+ "lxvd2x 59, %7, %1 \n\t"
+ "lxvd2x 60, %8, %1 \n\t"
+ "lxvd2x 61, %9, %1 \n\t"
+ "lxvd2x 62, %10, %1 \n\t"
+ "lxvd2x 63, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 52, %8, %4 \n\t"
+ "stxvd2x 53, %9, %4 \n\t"
+ "stxvd2x 54, %10, %4 \n\t"
+ "stxvd2x 55, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvd2x 56, 0, %4 \n\t"
+ "stxvd2x 57, %5, %4 \n\t"
+ "stxvd2x 58, %6, %4 \n\t"
+ "stxvd2x 59, %7, %4 \n\t"
+ "stxvd2x 60, %8, %4 \n\t"
+ "stxvd2x 61, %9, %4 \n\t"
+ "stxvd2x 62, %10, %4 \n\t"
+ "stxvd2x 63, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (y2), // 3
+ "r" (x2), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c
new file mode 100644
index 000000000..43311f2ba
--- /dev/null
+++ b/kernel/power/sasum.c
@@ -0,0 +1,146 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#if defined(POWER8)
+#include "sasum_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_32
+
+static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
+{
+
+ BLASLONG i=0;
+ FLOAT *x = x1;
+ FLOAT temp0, temp1, temp2, temp3;
+ FLOAT temp4, temp5, temp6, temp7;
+ FLOAT sum0 = 0.0;
+ FLOAT sum1 = 0.0;
+ FLOAT sum2 = 0.0;
+ FLOAT sum3 = 0.0;
+
+ while ( i< n )
+ {
+
+ temp0 = ABS(x[0]);
+ temp1 = ABS(x[1]);
+ temp2 = ABS(x[2]);
+ temp3 = ABS(x[3]);
+ temp4 = ABS(x[4]);
+ temp5 = ABS(x[5]);
+ temp6 = ABS(x[6]);
+ temp7 = ABS(x[7]);
+
+ sum0 += temp0;
+ sum1 += temp1;
+ sum2 += temp2;
+ sum3 += temp3;
+
+ sum0 += temp4;
+ sum1 += temp5;
+ sum2 += temp6;
+ sum3 += temp7;
+
+ x+=8;
+ i+=8;
+
+ }
+
+ svec[0] = sum0+sum1+sum2+sum3;
+ svec[1] = 0.0;
+ svec[2] = 0.0;
+ svec[3] = 0.0;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+ FLOAT svec[4] __attribute__ ((aligned (16)));;
+ BLASLONG n1;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ if ( inc_x == 1 )
+ {
+
+ n1 = n & -32;
+ if ( n1 > 0 )
+ {
+
+ sasum_kernel_32(n1, x, svec);
+ sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ sumf += ABS(x[i]);
+ i++;
+ }
+
+ }
+ else
+ {
+
+ n *= inc_x;
+ while(i < n)
+ {
+ sumf += ABS(x[i]);
+ i += inc_x;
+ }
+
+ }
+ return(sumf);
+}
+
+
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
new file mode 100644
index 000000000..847fffe04
--- /dev/null
+++ b/kernel/power/sasum_microk_power8.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
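+
+/*
+ * sasum_kernel_32 accumulates |x| over 32 floats per loop iteration using
+ * xvabssp/xvaddsp in eight vector accumulators, then reduces them into one
+ * register and stores four partial sums to svec; the caller adds them up.
+ */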
+static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
+
+static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "dcbt %2 , %4 \n\t"
+
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2 , %4 \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "addic. %0 , %0 , -32 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+
+ "stxvw4x 32, 0, %3 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (svec), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c
new file mode 100644
index 000000000..167c29bab
--- /dev/null
+++ b/kernel/power/scopy.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "scopy_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+	while ( i<n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+
+		i+=8;
+	}
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1 ))
+	{
+
+	BLASLONG n1 = n & -32;
+	if ( n1 > 0 )
+ {
+ scopy_kernel_32(n1, x, y);
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ y[i] = x[i] ;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ y[iy] = x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
new file mode 100644
index 000000000..2e08e3561
--- /dev/null
+++ b/kernel/power/scopy_microk_power8.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
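+
+/*
+ * scopy_kernel_32 copies 32 floats per loop iteration with lxvw4x/stxvw4x,
+ * interleaving the next loads between the stores.
+ */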
+
+static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
+ __asm__ __volatile__
+ (
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "stxvw4x 40, 0, %1 \n\t"
+ "stxvw4x 41, %5, %1 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "stxvw4x 42, %6, %1 \n\t"
+ "stxvw4x 43, %7, %1 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "stxvw4x 44, %8, %1 \n\t"
+ "stxvw4x 45, %9, %1 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "stxvw4x 46, %10, %1 \n\t"
+ "stxvw4x 47, %11, %1 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "stxvw4x 40, 0, %1 \n\t"
+ "stxvw4x 41, %5, %1 \n\t"
+ "stxvw4x 42, %6, %1 \n\t"
+ "stxvw4x 43, %7, %1 \n\t"
+ "stxvw4x 44, %8, %1 \n\t"
+ "stxvw4x 45, %9, %1 \n\t"
+ "stxvw4x 46, %10, %1 \n\t"
+ "stxvw4x 47, %11, %1 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c
new file mode 100644
index 000000000..52fb1fe24
--- /dev/null
+++ b/kernel/power/sdot.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "sdot_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+ BLASLONG register i = 0;
+ FLOAT dot = 0.0;
+
+ while(i < n)
+ {
+ dot += y[i] * x[i]
+ + y[i+1] * x[i+1]
+ + y[i+2] * x[i+2]
+ + y[i+3] * x[i+3]
+ + y[i+4] * x[i+4]
+ + y[i+5] * x[i+5]
+ + y[i+6] * x[i+6]
+ + y[i+7] * x[i+7] ;
+
+ i+=8 ;
+
+ }
+ *d += dot;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ FLOAT dot = 0.0 ;
+
+ if ( n <= 0 ) return(dot);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ BLASLONG n1 = n & -32;
+
+ if ( n1 )
+ sdot_kernel_16(n1, x, y , &dot );
+
+
+ i = n1;
+ while(i < n)
+ {
+
+ dot += y[i] * x[i] ;
+ i++ ;
+
+ }
+ return(dot);
+
+
+ }
+
+ BLASLONG n1 = n & -2;
+
+ while(i < n1)
+ {
+
+ dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
+ ix += inc_x*2 ;
+ iy += inc_y*2 ;
+ i+=2 ;
+
+ }
+
+ while(i < n)
+ {
+
+ dot += y[iy] * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(dot);
+
+}
+
+
diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
new file mode 100644
index 000000000..6dd588acd
--- /dev/null
+++ b/kernel/power/sdot_microk_power8.c
@@ -0,0 +1,179 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
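+
+/*
+ * sdot_kernel_16 multiplies 32 floats of x and y per loop iteration with
+ * xvmaddasp into eight vector accumulators, reduces them to one register
+ * and stores four partial sums to tempdot, which the caller adds together.
+ */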
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+ FLOAT tempdot[4];
+
+
+ __asm__ __volatile__
+ (
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "dcbt %2, %12 \n\t"
+ "dcbt %3, %12 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 48, 0, %3 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 49, %5, %3 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 50, %6, %3 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 51, %7, %3 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 52, %8, %3 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 53, %9, %3 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 54, %10, %3 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+ "lxvw4x 55, %11, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %12 \n\t"
+ "dcbt %3, %12 \n\t"
+
+ "xvmaddasp 32, 40, 48 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 48, 0, %3 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 49, %5, %3 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 50, %6, %3 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 51, %7, %3 \n\t"
+ "xvmaddasp 36, 44, 52 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 52, %8, %3 \n\t"
+ "xvmaddasp 37, 45, 53 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 53, %9, %3 \n\t"
+ "xvmaddasp 38, 46, 54 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 54, %10, %3 \n\t"
+ "xvmaddasp 39, 47, 55 \n\t"
+
+ "lxvw4x 47, %11, %2 \n\t"
+ "lxvw4x 55, %11, %3 \n\t"
+
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmaddasp 32, 40, 48 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "xvmaddasp 36, 44, 52 \n\t"
+ "xvmaddasp 37, 45, 53 \n\t"
+ "xvmaddasp 38, 46, 54 \n\t"
+ "xvmaddasp 39, 47, 55 \n\t"
+
+ "xvaddsp 32, 32 , 33 \n\t"
+ "xvaddsp 34, 34 , 35 \n\t"
+ "xvaddsp 36, 36 , 37 \n\t"
+ "xvaddsp 38, 38 , 39 \n\t"
+
+ "xvaddsp 32, 32 , 34 \n\t"
+ "xvaddsp 36, 36 , 38 \n\t"
+
+ "xvaddsp 32, 32 , 36 \n\t"
+
+ "stxvw4x 32, 0 , %4 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (y1), // 3
+ "r" (tempdot), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112), // 11
+ "r" (pre) // 12
+ : "cr0", "%0", "%2" , "%3", "memory"
+ );
+
+ *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3];
+
+
+}
+
+
diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S
new file mode 100644
index 000000000..77f3f7cfb
--- /dev/null
+++ b/kernel/power/sgemm_kernel_16x8_power8.S
@@ -0,0 +1,371 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 32752
+#define ALPHA_SP 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs30
+#define alpha_vr vs31
+
+#define o0 0
+
+#define FRAMEPOINTER r12
+
+#define BBUFFER r14
+#define o4 r15
+#define o12 r16
+#define o8 r17
+#define L r18
+#define T1 r19
+#define KK r20
+#define BBO r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T2 r31
+
+#include "sgemm_macros_16x8_power8.S"
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ mr FRAMEPOINTER, SP
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+#endif
+
+ // stfd f1, ALPHA_SP
+ // stw r0, FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+#endif
+
+ slwi LDC, LDC, 2
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#else
+ lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+#endif
+#endif
+#endif
+
+
+ cmpwi cr0, M, 0
+ ble L999_H1
+ cmpwi cr0, N, 0
+ ble L999_H1
+ cmpwi cr0, K, 0
+ ble L999_H1
+
+ li PRE, 256
+ li o4 , 4
+ li o8 , 8
+ li o12, 12
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+ addi BBUFFER, SP, 512+4096
+ li T1, -4096
+ and BBUFFER, BBUFFER, T1
+
+ addi T1, SP, 300
+ stxsspx f1, o0 , T1
+ stxsspx f1, o4 , T1
+ stxsspx f1, o8 , T1
+ stxsspx f1, o12 , T1
+
+ lxsspx alpha_r, o0, T1
+ lxvw4x alpha_vr, o0, T1
+
+
+
+#include "sgemm_logic_16x8_power8.S"
+
+L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S
new file mode 100644
index 000000000..06bb79ea3
--- /dev/null
+++ b/kernel/power/sgemm_logic_16x8_power8.S
@@ -0,0 +1,2323 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+ srawi. J, N, 3
+ ble SGEMM_L8_END
+
+SGEMM_L8_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 3
+
+SGEMM_L8_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge SGEMM_L8_COPYB
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 3
+ add C, C, T1
+ srawi. I, M, 4
+ ble SGEMM_L8x16_END
+
+SGEMM_L8x16_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L8x16_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L8x16_SUB4
+
+SGEMM_L8x16_LOOP_START:
+
+ dcbt AO, PRE
+ dcbt BO, PRE
+ LOAD8x16_1
+ dcbt BO, PRE
+ KERNEL8x16_I1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ addic. L, L, -2
+ ble SGEMM_L8x16_LOOP_END
+
+ .align 5
+
+SGEMM_L8x16_LOOP:
+
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ addic. L, L, -1
+ bgt SGEMM_L8x16_LOOP
+
+SGEMM_L8x16_LOOP_END:
+
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt BO, PRE
+ KERNEL8x16_1
+ dcbt BO, PRE
+ dcbt AO, PRE
+ KERNEL8x16_2
+ KERNEL8x16_1
+ KERNEL8x16_E2
+
+ b SGEMM_L8x16_SUB1
+
+SGEMM_L8x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL8x16_SUBI1
+ KERNEL8x16_SUB1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+
+ b SGEMM_L8x16_SUB1
+
+SGEMM_L8x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL8x16_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L8x16_SAVE
+ b SGEMM_L8x16_SUB2
+
+SGEMM_L8x16_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L8x16_SAVE
+
+SGEMM_L8x16_SUB2:
+
+ KERNEL8x16_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L8x16_SUB2
+
+SGEMM_L8x16_SAVE:
+
+ SAVE8x16
+
+ addic. I, I, -1
+ bgt SGEMM_L8x16_BEGIN
+
+SGEMM_L8x16_END:
+
+SGEMM_L8x8_BEGIN:
+
+ andi. T2, M, 15
+ ble SGEMM_L8x1_END
+
+ andi. T1, M, 8
+ ble SGEMM_L8x8_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L8x8_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L8x8_SUB4
+
+SGEMM_L8x8_LOOP_START:
+
+ LOAD8x8_1
+ KERNEL8x8_I1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ addic. L, L, -2
+ ble SGEMM_L8x8_LOOP_END
+
+ .align 5
+
+SGEMM_L8x8_LOOP:
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ addic. L, L, -1
+ bgt SGEMM_L8x8_LOOP
+
+SGEMM_L8x8_LOOP_END:
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_E2
+
+ b SGEMM_L8x8_SUB1
+
+SGEMM_L8x8_SUB4:
+
+ KERNEL8x8_SUBI1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+
+ b SGEMM_L8x8_SUB1
+
+SGEMM_L8x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL8x8_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L8x8_SAVE
+ b SGEMM_L8x8_SUB2
+
+SGEMM_L8x8_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L8x8_SAVE
+
+SGEMM_L8x8_SUB2:
+
+ KERNEL8x8_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L8x8_SUB2
+
+SGEMM_L8x8_SAVE:
+
+ SAVE8x8
+
+SGEMM_L8x8_END:
+
+SGEMM_L8x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble SGEMM_L8x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L8x4_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L8x4_SUB4
+
+SGEMM_L8x4_LOOP_START:
+
+ LOAD8x4_1
+ KERNEL8x4_I1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ addic. L, L, -2
+ ble SGEMM_L8x4_LOOP_END
+
+ .align 5
+
+SGEMM_L8x4_LOOP:
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ addic. L, L, -1
+ bgt SGEMM_L8x4_LOOP
+
+SGEMM_L8x4_LOOP_END:
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_E2
+
+ b SGEMM_L8x4_SUB1
+
+SGEMM_L8x4_SUB4:
+
+ KERNEL8x4_SUBI1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+
+ b SGEMM_L8x4_SUB1
+
+SGEMM_L8x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL8x4_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L8x4_SAVE
+ b SGEMM_L8x4_SUB2
+
+SGEMM_L8x4_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L8x4_SAVE
+
+SGEMM_L8x4_SUB2:
+
+ KERNEL8x4_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L8x4_SUB2
+
+SGEMM_L8x4_SAVE:
+
+ SAVE8x4
+
+SGEMM_L8x4_END:
+
+SGEMM_L8x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble SGEMM_L8x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L8x2_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L8x2_SUB4
+
+SGEMM_L8x2_LOOP_START:
+
+ LOAD8x2_1
+ KERNEL8x2_I1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ addic. L, L, -2
+ ble SGEMM_L8x2_LOOP_END
+
+ .align 5
+
+SGEMM_L8x2_LOOP:
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ addic. L, L, -1
+ bgt SGEMM_L8x2_LOOP
+
+SGEMM_L8x2_LOOP_END:
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_E2
+
+ b SGEMM_L8x2_SUB1
+
+SGEMM_L8x2_SUB4:
+
+ KERNEL8x2_SUBI1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+
+ b SGEMM_L8x2_SUB1
+
+SGEMM_L8x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL8x2_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L8x2_SAVE
+ b SGEMM_L8x2_SUB2
+
+SGEMM_L8x2_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L8x2_SAVE
+
+SGEMM_L8x2_SUB2:
+
+ KERNEL8x2_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L8x2_SUB2
+
+SGEMM_L8x2_SAVE:
+
+ SAVE8x2
+
+SGEMM_L8x2_END:
+
+SGEMM_L8x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble SGEMM_L8x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L8x1_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L8x1_SUB4
+
+SGEMM_L8x1_LOOP_START:
+
+ LOAD8x1_1
+ KERNEL8x1_I1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ addic. L, L, -2
+ ble SGEMM_L8x1_LOOP_END
+
+ .align 5
+
+SGEMM_L8x1_LOOP:
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ addic. L, L, -1
+ bgt SGEMM_L8x1_LOOP
+
+SGEMM_L8x1_LOOP_END:
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_E2
+
+ b SGEMM_L8x1_SUB1
+
+SGEMM_L8x1_SUB4:
+
+ KERNEL8x1_SUBI1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+
+ b SGEMM_L8x1_SUB1
+
+SGEMM_L8x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL8x1_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L8x1_SAVE
+ b SGEMM_L8x1_SUB2
+
+SGEMM_L8x1_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L8x1_SAVE
+
+SGEMM_L8x1_SUB2:
+
+ KERNEL8x1_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L8x1_SUB2
+
+SGEMM_L8x1_SAVE:
+
+ SAVE8x1
+
+SGEMM_L8x1_END:
+
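+/* Advance B past the 8 columns just processed (K * 8 floats = K * 32 bytes) */
+/* and loop back for the next block of 8 columns.                            */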
+ slwi T1, K, 5
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt SGEMM_L8_BEGIN
+
+ andi. T2, N, 7
+ ble L999
+
+SGEMM_L8_END:
+
+ b SGEMM_L4_BEGIN
+
+L999_H1:
+
+ b L999
+
+SGEMM_L4_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 2
+
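+/* Expand B for the remaining block of 4 columns: each xxspltw broadcasts */
+/* one B value across a full vector so the kernels below can use vector   */
+/* multiply-adds directly.                                                */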
+SGEMM_L4_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge SGEMM_L4_COPYB
+
+ andi. T1, N, 4
+ ble SGEMM_L4_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+ srawi. I, M, 4
+ ble SGEMM_L4x16_END
+
+SGEMM_L4x16_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L4x16_SUB4
+
+SGEMM_L4x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD4x16_1
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble SGEMM_L4x16_LOOP_END
+
+ .align 5
+
+SGEMM_L4x16_LOOP:
+
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt SGEMM_L4x16_LOOP
+
+SGEMM_L4x16_LOOP_END:
+
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b SGEMM_L4x16_SUB1
+
+SGEMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b SGEMM_L4x16_SUB1
+
+SGEMM_L4x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L4x16_SAVE
+ b SGEMM_L4x16_SUB2
+
+SGEMM_L4x16_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L4x16_SAVE
+
+SGEMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L4x16_SUB2
+
+SGEMM_L4x16_SAVE:
+
+ SAVE4x16
+
+ addic. I, I, -1
+ bgt SGEMM_L4x16_BEGIN
+
+SGEMM_L4x16_END:
+
+SGEMM_L4x8_BEGIN:
+
+ andi. T2, M, 15
+ ble SGEMM_L4x1_END
+
+ andi. T1, M, 8
+ ble SGEMM_L4x8_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L4x8_SUB4
+
+SGEMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble SGEMM_L4x8_LOOP_END
+
+ .align 5
+
+SGEMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt SGEMM_L4x8_LOOP
+
+SGEMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b SGEMM_L4x8_SUB1
+
+SGEMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b SGEMM_L4x8_SUB1
+
+SGEMM_L4x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L4x8_SAVE
+ b SGEMM_L4x8_SUB2
+
+SGEMM_L4x8_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L4x8_SAVE
+
+SGEMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L4x8_SUB2
+
+SGEMM_L4x8_SAVE:
+
+ SAVE4x8
+
+SGEMM_L4x8_END:
+
+SGEMM_L4x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble SGEMM_L4x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L4x4_SUB4
+
+SGEMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble SGEMM_L4x4_LOOP_END
+
+ .align 5
+
+SGEMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt SGEMM_L4x4_LOOP
+
+SGEMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b SGEMM_L4x4_SUB1
+
+SGEMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b SGEMM_L4x4_SUB1
+
+SGEMM_L4x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L4x4_SAVE
+ b SGEMM_L4x4_SUB2
+
+SGEMM_L4x4_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L4x4_SAVE
+
+SGEMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L4x4_SUB2
+
+SGEMM_L4x4_SAVE:
+
+ SAVE4x4
+
+SGEMM_L4x4_END:
+
+SGEMM_L4x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble SGEMM_L4x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L4x2_SUB4
+
+SGEMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble SGEMM_L4x2_LOOP_END
+
+ .align 5
+
+SGEMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt SGEMM_L4x2_LOOP
+
+SGEMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b SGEMM_L4x2_SUB1
+
+SGEMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b SGEMM_L4x2_SUB1
+
+SGEMM_L4x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L4x2_SAVE
+ b SGEMM_L4x2_SUB2
+
+SGEMM_L4x2_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L4x2_SAVE
+
+SGEMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L4x2_SUB2
+
+SGEMM_L4x2_SAVE:
+
+ SAVE4x2
+
+SGEMM_L4x2_END:
+
+SGEMM_L4x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble SGEMM_L4x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L4x1_SUB4
+
+SGEMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble SGEMM_L4x1_LOOP_END
+
+ .align 5
+
+SGEMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt SGEMM_L4x1_LOOP
+
+SGEMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b SGEMM_L4x1_SUB1
+
+SGEMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b SGEMM_L4x1_SUB1
+
+SGEMM_L4x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L4x1_SAVE
+ b SGEMM_L4x1_SUB2
+
+SGEMM_L4x1_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L4x1_SAVE
+
+SGEMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L4x1_SUB2
+
+SGEMM_L4x1_SAVE:
+
+ SAVE4x1
+
+SGEMM_L4x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+SGEMM_L4_END:
+SGEMM_L2_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 1
+
+SGEMM_L2_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge SGEMM_L2_COPYB
+
+ andi. T1, N, 2
+ ble SGEMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+ srawi. I, M, 4
+ ble SGEMM_L2x16_END
+
+SGEMM_L2x16_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L2x16_SUB4
+
+SGEMM_L2x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x16_1
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble SGEMM_L2x16_LOOP_END
+
+ .align 5
+
+SGEMM_L2x16_LOOP:
+
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt SGEMM_L2x16_LOOP
+
+SGEMM_L2x16_LOOP_END:
+
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b SGEMM_L2x16_SUB1
+
+SGEMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b SGEMM_L2x16_SUB1
+
+SGEMM_L2x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L2x16_SAVE
+ b SGEMM_L2x16_SUB2
+
+SGEMM_L2x16_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L2x16_SAVE
+
+SGEMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L2x16_SUB2
+
+SGEMM_L2x16_SAVE:
+
+ SAVE2x16
+
+ addic. I, I, -1
+ bgt SGEMM_L2x16_BEGIN
+
+SGEMM_L2x16_END:
+
+SGEMM_L2x8_BEGIN:
+
+ andi. T2, M, 15
+ ble SGEMM_L2x1_END
+
+ andi. T1, M, 8
+ ble SGEMM_L2x8_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L2x8_SUB4
+
+SGEMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble SGEMM_L2x8_LOOP_END
+
+ .align 5
+
+SGEMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt SGEMM_L2x8_LOOP
+
+SGEMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b SGEMM_L2x8_SUB1
+
+SGEMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b SGEMM_L2x8_SUB1
+
+SGEMM_L2x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L2x8_SAVE
+ b SGEMM_L2x8_SUB2
+
+SGEMM_L2x8_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L2x8_SAVE
+
+SGEMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L2x8_SUB2
+
+SGEMM_L2x8_SAVE:
+
+ SAVE2x8
+
+SGEMM_L2x8_END:
+
+SGEMM_L2x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble SGEMM_L2x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L2x4_SUB4
+
+SGEMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble SGEMM_L2x4_LOOP_END
+
+ .align 5
+
+SGEMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt SGEMM_L2x4_LOOP
+
+SGEMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b SGEMM_L2x4_SUB1
+
+SGEMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b SGEMM_L2x4_SUB1
+
+SGEMM_L2x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L2x4_SAVE
+ b SGEMM_L2x4_SUB2
+
+SGEMM_L2x4_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L2x4_SAVE
+
+SGEMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L2x4_SUB2
+
+SGEMM_L2x4_SAVE:
+
+ SAVE2x4
+
+SGEMM_L2x4_END:
+
+SGEMM_L2x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble SGEMM_L2x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L2x2_SUB4
+
+SGEMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble SGEMM_L2x2_LOOP_END
+
+ .align 5
+
+SGEMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt SGEMM_L2x2_LOOP
+
+SGEMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b SGEMM_L2x2_SUB1
+
+SGEMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b SGEMM_L2x2_SUB1
+
+SGEMM_L2x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L2x2_SAVE
+ b SGEMM_L2x2_SUB2
+
+SGEMM_L2x2_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L2x2_SAVE
+
+SGEMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L2x2_SUB2
+
+SGEMM_L2x2_SAVE:
+
+ SAVE2x2
+
+SGEMM_L2x2_END:
+
+SGEMM_L2x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble SGEMM_L2x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L2x1_SUB4
+
+SGEMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble SGEMM_L2x1_LOOP_END
+
+ .align 5
+
+SGEMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt SGEMM_L2x1_LOOP
+
+SGEMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b SGEMM_L2x1_SUB1
+
+SGEMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b SGEMM_L2x1_SUB1
+
+SGEMM_L2x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L2x1_SAVE
+ b SGEMM_L2x1_SUB2
+
+SGEMM_L2x1_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L2x1_SAVE
+
+SGEMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L2x1_SUB2
+
+SGEMM_L2x1_SAVE:
+
+ SAVE2x1
+
+SGEMM_L2x1_END:
+
+ slwi T1, K, 3
+ add B, B, T1
+
+SGEMM_L2_END:
+SGEMM_L1_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 0
+
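+/* Expand B for the last remaining column (the shift by 0 keeps T1 = K). */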
+SGEMM_L1_COPYB:
+ dcbtst BBO, PRE
+
+ lxvw4x vs3, o0, BO
+ lxvw4x vs11, o16, BO
+ xxspltw vs4, vs3, 0
+ xxspltw vs5, vs3, 1
+ xxspltw vs6, vs3, 2
+ xxspltw vs7, vs3, 3
+ xxspltw vs12, vs11, 0
+ xxspltw vs13, vs11, 1
+ xxspltw vs14, vs11, 2
+ xxspltw vs15, vs11, 3
+ stxvw4x vs4, o0, BBO
+ stxvw4x vs5, o16, BBO
+ stxvw4x vs6, o32, BBO
+ stxvw4x vs7, o48, BBO
+ addi BO, BO, 32
+ addi BBO, BBO, 64
+ stxvw4x vs12, o0, BBO
+ stxvw4x vs13, o16, BBO
+ stxvw4x vs14, o32, BBO
+ stxvw4x vs15, o48, BBO
+ addic. T1, T1, -8
+ addi BBO, BBO, 64
+
+ bge SGEMM_L1_COPYB
+
+ andi. T1, N, 1
+ ble SGEMM_L1_END
+ mr CO, C
+ mr AO, A
+ srawi. I, M, 4
+ ble SGEMM_L1x16_END
+
+SGEMM_L1x16_BEGIN:
+
+
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L1x16_SUB4
+
+SGEMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble SGEMM_L1x16_LOOP_END
+
+ .align 5
+
+SGEMM_L1x16_LOOP:
+
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt SGEMM_L1x16_LOOP
+
+SGEMM_L1x16_LOOP_END:
+
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b SGEMM_L1x16_SUB1
+
+SGEMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b SGEMM_L1x16_SUB1
+
+SGEMM_L1x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L1x16_SAVE
+ b SGEMM_L1x16_SUB2
+
+SGEMM_L1x16_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L1x16_SAVE
+
+SGEMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L1x16_SUB2
+
+SGEMM_L1x16_SAVE:
+
+ SAVE1x16
+
+ addic. I, I, -1
+ bgt SGEMM_L1x16_BEGIN
+
+SGEMM_L1x16_END:
+
+SGEMM_L1x8_BEGIN:
+
+ andi. T2, M, 15
+ ble SGEMM_L1x1_END
+
+ andi. T1, M, 8
+ ble SGEMM_L1x8_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L1x8_SUB4
+
+SGEMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble SGEMM_L1x8_LOOP_END
+
+ .align 5
+
+SGEMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt SGEMM_L1x8_LOOP
+
+SGEMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b SGEMM_L1x8_SUB1
+
+SGEMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b SGEMM_L1x8_SUB1
+
+SGEMM_L1x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L1x8_SAVE
+ b SGEMM_L1x8_SUB2
+
+SGEMM_L1x8_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L1x8_SAVE
+
+SGEMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L1x8_SUB2
+
+SGEMM_L1x8_SAVE:
+
+ SAVE1x8
+
+SGEMM_L1x8_END:
+
+SGEMM_L1x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble SGEMM_L1x4_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L1x4_SUB4
+
+SGEMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble SGEMM_L1x4_LOOP_END
+
+ .align 5
+
+SGEMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt SGEMM_L1x4_LOOP
+
+SGEMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b SGEMM_L1x4_SUB1
+
+SGEMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b SGEMM_L1x4_SUB1
+
+SGEMM_L1x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L1x4_SAVE
+ b SGEMM_L1x4_SUB2
+
+SGEMM_L1x4_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L1x4_SAVE
+
+SGEMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L1x4_SUB2
+
+SGEMM_L1x4_SAVE:
+
+ SAVE1x4
+
+SGEMM_L1x4_END:
+
+SGEMM_L1x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble SGEMM_L1x2_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L1x2_SUB4
+
+SGEMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble SGEMM_L1x2_LOOP_END
+
+ .align 5
+
+SGEMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt SGEMM_L1x2_LOOP
+
+SGEMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b SGEMM_L1x2_SUB1
+
+SGEMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b SGEMM_L1x2_SUB1
+
+SGEMM_L1x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L1x2_SAVE
+ b SGEMM_L1x2_SUB2
+
+SGEMM_L1x2_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L1x2_SAVE
+
+SGEMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L1x2_SUB2
+
+SGEMM_L1x2_SAVE:
+
+ SAVE1x2
+
+SGEMM_L1x2_END:
+
+SGEMM_L1x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble SGEMM_L1x1_END
+ mr BO, BBUFFER
+ srawi. L, K, 3
+ ble SGEMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble SGEMM_L1x1_SUB4
+
+SGEMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble SGEMM_L1x1_LOOP_END
+
+ .align 5
+
+SGEMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt SGEMM_L1x1_LOOP
+
+SGEMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b SGEMM_L1x1_SUB1
+
+SGEMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b SGEMM_L1x1_SUB1
+
+SGEMM_L1x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble SGEMM_L1x1_SAVE
+ b SGEMM_L1x1_SUB2
+
+SGEMM_L1x1_SUB1:
+
+ andi. L, K, 7
+ ble SGEMM_L1x1_SAVE
+
+SGEMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt SGEMM_L1x1_SUB2
+
+SGEMM_L1x1_SAVE:
+
+ SAVE1x1
+
+SGEMM_L1x1_END:
+
+SGEMM_L1_END:
diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S
new file mode 100644
index 000000000..71dc52979
--- /dev/null
+++ b/kernel/power/sgemm_macros_16x8_power8.S
@@ -0,0 +1,5888 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
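+/* Register usage for the 16x8 kernels:                                     */
+/*   vs0-vs3  / vs4-vs7   : two alternating sets of 16 A values             */
+/*   vs8-vs15 / vs16-vs23 : two alternating sets of the 8 splatted B values */
+/*   vs32-vs63            : the 32 accumulators holding the 16x8 C tile     */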
+
+.macro LOAD8x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+.endm
+
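+/* KERNEL8x16_I1 starts the accumulators with xvmulsp; KERNEL8x16_1 and     */
+/* KERNEL8x16_2 alternate between the two register sets so the loads for    */
+/* the next k step overlap the multiply-adds of the current one;            */
+/* KERNEL8x16_E2 is the epilogue that only finishes the pending adds.       */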
+.macro KERNEL8x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs20, o0, T1
+ lxvw4x vs21, o16, T1
+ lxvw4x vs22, o32, T1
+ lxvw4x vs23, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+ xvmulsp vs48, vs0, vs12
+ xvmulsp vs49, vs1, vs12
+ xvmulsp vs50, vs2, vs12
+ xvmulsp vs51, vs3, vs12
+
+ xvmulsp vs52, vs0, vs13
+ xvmulsp vs53, vs1, vs13
+ xvmulsp vs54, vs2, vs13
+ xvmulsp vs55, vs3, vs13
+
+ xvmulsp vs56, vs0, vs14
+ xvmulsp vs57, vs1, vs14
+ xvmulsp vs58, vs2, vs14
+ xvmulsp vs59, vs3, vs14
+
+ xvmulsp vs60, vs0, vs15
+ xvmulsp vs61, vs1, vs15
+ xvmulsp vs62, vs2, vs15
+ xvmulsp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs20, o0, T1
+ lxvw4x vs21, o16, T1
+ lxvw4x vs22, o32, T1
+ lxvw4x vs23, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+ xvmaddasp vs48, vs0, vs12
+ xvmaddasp vs49, vs1, vs12
+ xvmaddasp vs50, vs2, vs12
+ xvmaddasp vs51, vs3, vs12
+
+ xvmaddasp vs52, vs0, vs13
+ xvmaddasp vs53, vs1, vs13
+ xvmaddasp vs54, vs2, vs13
+ xvmaddasp vs55, vs3, vs13
+
+ xvmaddasp vs56, vs0, vs14
+ xvmaddasp vs57, vs1, vs14
+ xvmaddasp vs58, vs2, vs14
+ xvmaddasp vs59, vs3, vs14
+
+ xvmaddasp vs60, vs0, vs15
+ xvmaddasp vs61, vs1, vs15
+ xvmaddasp vs62, vs2, vs15
+ xvmaddasp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+ xvmaddasp vs48, vs4, vs20
+ xvmaddasp vs49, vs5, vs20
+ xvmaddasp vs50, vs6, vs20
+ xvmaddasp vs51, vs7, vs20
+
+ xvmaddasp vs52, vs4, vs21
+ xvmaddasp vs53, vs5, vs21
+ xvmaddasp vs54, vs6, vs21
+ xvmaddasp vs55, vs7, vs21
+
+ xvmaddasp vs56, vs4, vs22
+ xvmaddasp vs57, vs5, vs22
+ xvmaddasp vs58, vs6, vs22
+ xvmaddasp vs59, vs7, vs22
+
+ xvmaddasp vs60, vs4, vs23
+ xvmaddasp vs61, vs5, vs23
+ xvmaddasp vs62, vs6, vs23
+ xvmaddasp vs63, vs7, vs23
+
+
+.endm
+
+.macro KERNEL8x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+ xvmaddasp vs48, vs4, vs20
+ xvmaddasp vs49, vs5, vs20
+ xvmaddasp vs50, vs6, vs20
+ xvmaddasp vs51, vs7, vs20
+
+ xvmaddasp vs52, vs4, vs21
+ xvmaddasp vs53, vs5, vs21
+ xvmaddasp vs54, vs6, vs21
+ xvmaddasp vs55, vs7, vs21
+
+ xvmaddasp vs56, vs4, vs22
+ xvmaddasp vs57, vs5, vs22
+ xvmaddasp vs58, vs6, vs22
+ xvmaddasp vs59, vs7, vs22
+
+ xvmaddasp vs60, vs4, vs23
+ xvmaddasp vs61, vs5, vs23
+ xvmaddasp vs62, vs6, vs23
+ xvmaddasp vs63, vs7, vs23
+
+
+.endm
+
+.macro KERNEL8x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+ xvmulsp vs48, vs0, vs12
+ xvmulsp vs49, vs1, vs12
+ xvmulsp vs50, vs2, vs12
+ xvmulsp vs51, vs3, vs12
+
+ xvmulsp vs52, vs0, vs13
+ xvmulsp vs53, vs1, vs13
+ xvmulsp vs54, vs2, vs13
+ xvmulsp vs55, vs3, vs13
+
+ xvmulsp vs56, vs0, vs14
+ xvmulsp vs57, vs1, vs14
+ xvmulsp vs58, vs2, vs14
+ xvmulsp vs59, vs3, vs14
+
+ xvmulsp vs60, vs0, vs15
+ xvmulsp vs61, vs1, vs15
+ xvmulsp vs62, vs2, vs15
+ xvmulsp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+ xvmaddasp vs48, vs0, vs12
+ xvmaddasp vs49, vs1, vs12
+ xvmaddasp vs50, vs2, vs12
+ xvmaddasp vs51, vs3, vs12
+
+ xvmaddasp vs52, vs0, vs13
+ xvmaddasp vs53, vs1, vs13
+ xvmaddasp vs54, vs2, vs13
+ xvmaddasp vs55, vs3, vs13
+
+ xvmaddasp vs56, vs0, vs14
+ xvmaddasp vs57, vs1, vs14
+ xvmaddasp vs58, vs2, vs14
+ xvmaddasp vs59, vs3, vs14
+
+ xvmaddasp vs60, vs0, vs15
+ xvmaddasp vs61, vs1, vs15
+ xvmaddasp vs62, vs2, vs15
+ xvmaddasp vs63, vs3, vs15
+
+
+.endm
+
+.macro SAVE8x16
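+
+/* Write the 16x8 tile back to C, one column (LDC stride) at a time: for the */
+/* plain GEMM build C is loaded and updated as C += alpha * AB (xvmaddasp);  */
+/* for TRMMKERNEL the tile is stored as alpha * AB without reading C first.  */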
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+ xvmulsp vs2, vs42, alpha_vr
+ xvmulsp vs3, vs43, alpha_vr
+#else
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+ xvmaddasp vs2, vs42, alpha_vr
+ xvmaddasp vs3, vs43, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+ xvmulsp vs2, vs46, alpha_vr
+ xvmulsp vs3, vs47, alpha_vr
+#else
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+ xvmaddasp vs2, vs46, alpha_vr
+ xvmaddasp vs3, vs47, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs48, alpha_vr
+ xvmulsp vs1, vs49, alpha_vr
+ xvmulsp vs2, vs50, alpha_vr
+ xvmulsp vs3, vs51, alpha_vr
+#else
+ xvmaddasp vs0, vs48, alpha_vr
+ xvmaddasp vs1, vs49, alpha_vr
+ xvmaddasp vs2, vs50, alpha_vr
+ xvmaddasp vs3, vs51, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs52, alpha_vr
+ xvmulsp vs1, vs53, alpha_vr
+ xvmulsp vs2, vs54, alpha_vr
+ xvmulsp vs3, vs55, alpha_vr
+#else
+ xvmaddasp vs0, vs52, alpha_vr
+ xvmaddasp vs1, vs53, alpha_vr
+ xvmaddasp vs2, vs54, alpha_vr
+ xvmaddasp vs3, vs55, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs56, alpha_vr
+ xvmulsp vs1, vs57, alpha_vr
+ xvmulsp vs2, vs58, alpha_vr
+ xvmulsp vs3, vs59, alpha_vr
+#else
+ xvmaddasp vs0, vs56, alpha_vr
+ xvmaddasp vs1, vs57, alpha_vr
+ xvmaddasp vs2, vs58, alpha_vr
+ xvmaddasp vs3, vs59, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs60, alpha_vr
+ xvmulsp vs1, vs61, alpha_vr
+ xvmulsp vs2, vs62, alpha_vr
+ xvmulsp vs3, vs63, alpha_vr
+#else
+ xvmaddasp vs0, vs60, alpha_vr
+ xvmaddasp vs1, vs61, alpha_vr
+ xvmaddasp vs2, vs62, alpha_vr
+ xvmaddasp vs3, vs63, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
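+/* Same pipeline as the 16x8 case, but with only two A vectors (8 rows) per */
+/* k step; the 8x8 C tile lives in accumulators vs32-vs47.                  */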
+
+.macro LOAD8x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+.endm
+
+.macro KERNEL8x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs20, o0, T1
+ lxvw4x vs21, o16, T1
+ lxvw4x vs22, o32, T1
+ lxvw4x vs23, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+ xvmulsp vs40, vs0, vs12
+ xvmulsp vs41, vs1, vs12
+
+ xvmulsp vs42, vs0, vs13
+ xvmulsp vs43, vs1, vs13
+
+ xvmulsp vs44, vs0, vs14
+ xvmulsp vs45, vs1, vs14
+
+ xvmulsp vs46, vs0, vs15
+ xvmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs20, o0, T1
+ lxvw4x vs21, o16, T1
+ lxvw4x vs22, o32, T1
+ lxvw4x vs23, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+ xvmaddasp vs40, vs0, vs12
+ xvmaddasp vs41, vs1, vs12
+
+ xvmaddasp vs42, vs0, vs13
+ xvmaddasp vs43, vs1, vs13
+
+ xvmaddasp vs44, vs0, vs14
+ xvmaddasp vs45, vs1, vs14
+
+ xvmaddasp vs46, vs0, vs15
+ xvmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+ xvmaddasp vs40, vs4, vs20
+ xvmaddasp vs41, vs5, vs20
+
+ xvmaddasp vs42, vs4, vs21
+ xvmaddasp vs43, vs5, vs21
+
+ xvmaddasp vs44, vs4, vs22
+ xvmaddasp vs45, vs5, vs22
+
+ xvmaddasp vs46, vs4, vs23
+ xvmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+ xvmaddasp vs40, vs4, vs20
+ xvmaddasp vs41, vs5, vs20
+
+ xvmaddasp vs42, vs4, vs21
+ xvmaddasp vs43, vs5, vs21
+
+ xvmaddasp vs44, vs4, vs22
+ xvmaddasp vs45, vs5, vs22
+
+ xvmaddasp vs46, vs4, vs23
+ xvmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+ xvmulsp vs40, vs0, vs12
+ xvmulsp vs41, vs1, vs12
+
+ xvmulsp vs42, vs0, vs13
+ xvmulsp vs43, vs1, vs13
+
+ xvmulsp vs44, vs0, vs14
+ xvmulsp vs45, vs1, vs14
+
+ xvmulsp vs46, vs0, vs15
+ xvmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+ xvmaddasp vs40, vs0, vs12
+ xvmaddasp vs41, vs1, vs12
+
+ xvmaddasp vs42, vs0, vs13
+ xvmaddasp vs43, vs1, vs13
+
+ xvmaddasp vs44, vs0, vs14
+ xvmaddasp vs45, vs1, vs14
+
+ xvmaddasp vs46, vs0, vs15
+ xvmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro SAVE8x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs38, alpha_vr
+ xvmulsp vs1, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs38, alpha_vr
+ xvmaddasp vs1, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+#else
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs42, alpha_vr
+ xvmulsp vs1, vs43, alpha_vr
+#else
+ xvmaddasp vs0, vs42, alpha_vr
+ xvmaddasp vs1, vs43, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+#else
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs46, alpha_vr
+ xvmulsp vs1, vs47, alpha_vr
+#else
+ xvmaddasp vs0, vs46, alpha_vr
+ xvmaddasp vs1, vs47, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
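+/* One A vector (4 rows) per k step; the 4x8 C tile uses accumulators vs32-vs39. */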
+
+.macro LOAD8x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+.endm
+
+.macro KERNEL8x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs20, o0, T1
+ lxvw4x vs21, o16, T1
+ lxvw4x vs22, o32, T1
+ lxvw4x vs23, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+ xvmulsp vs36, vs0, vs12
+
+ xvmulsp vs37, vs0, vs13
+
+ xvmulsp vs38, vs0, vs14
+
+ xvmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs20, o0, T1
+ lxvw4x vs21, o16, T1
+ lxvw4x vs22, o32, T1
+ lxvw4x vs23, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+ xvmaddasp vs36, vs0, vs12
+
+ xvmaddasp vs37, vs0, vs13
+
+ xvmaddasp vs38, vs0, vs14
+
+ xvmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+ xvmaddasp vs36, vs4, vs20
+
+ xvmaddasp vs37, vs4, vs21
+
+ xvmaddasp vs38, vs4, vs22
+
+ xvmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+ xvmaddasp vs36, vs4, vs20
+
+ xvmaddasp vs37, vs4, vs21
+
+ xvmaddasp vs38, vs4, vs22
+
+ xvmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+ xvmulsp vs36, vs0, vs12
+
+ xvmulsp vs37, vs0, vs13
+
+ xvmulsp vs38, vs0, vs14
+
+ xvmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxvw4x vs12, o0, T1
+ lxvw4x vs13, o16, T1
+ lxvw4x vs14, o32, T1
+ lxvw4x vs15, o48, T1
+
+ addi BO, BO, 128
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+ xvmaddasp vs36, vs0, vs12
+
+ xvmaddasp vs37, vs0, vs13
+
+ xvmaddasp vs38, vs0, vs14
+
+ xvmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro SAVE8x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs37, alpha_vr
+#else
+ xvmaddasp vs0, vs37, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs38, alpha_vr
+#else
+ xvmaddasp vs0, vs38, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
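+/* The M=2 (and M=1) leftovers are handled with scalar loads (lxsspx) and  */
+/* scalar multiply-adds, so SAVE8x2 scales with alpha_r instead of the     */
+/* vector alpha_vr.                                                        */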
+
+.macro LOAD8x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+.endm
+
+.macro KERNEL8x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o16, T1
+ lxsspx vs22, o32, T1
+ lxsspx vs23, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+ xsmuldp vs40, vs0, vs12
+ xsmuldp vs41, vs1, vs12
+
+ xsmuldp vs42, vs0, vs13
+ xsmuldp vs43, vs1, vs13
+
+ xsmuldp vs44, vs0, vs14
+ xsmuldp vs45, vs1, vs14
+
+ xsmuldp vs46, vs0, vs15
+ xsmuldp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o16, T1
+ lxsspx vs22, o32, T1
+ lxsspx vs23, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+ xsmaddadp vs40, vs0, vs12
+ xsmaddadp vs41, vs1, vs12
+
+ xsmaddadp vs42, vs0, vs13
+ xsmaddadp vs43, vs1, vs13
+
+ xsmaddadp vs44, vs0, vs14
+ xsmaddadp vs45, vs1, vs14
+
+ xsmaddadp vs46, vs0, vs15
+ xsmaddadp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+ xsmaddadp vs40, vs4, vs20
+ xsmaddadp vs41, vs5, vs20
+
+ xsmaddadp vs42, vs4, vs21
+ xsmaddadp vs43, vs5, vs21
+
+ xsmaddadp vs44, vs4, vs22
+ xsmaddadp vs45, vs5, vs22
+
+ xsmaddadp vs46, vs4, vs23
+ xsmaddadp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+ xsmaddadp vs40, vs4, vs20
+ xsmaddadp vs41, vs5, vs20
+
+ xsmaddadp vs42, vs4, vs21
+ xsmaddadp vs43, vs5, vs21
+
+ xsmaddadp vs44, vs4, vs22
+ xsmaddadp vs45, vs5, vs22
+
+ xsmaddadp vs46, vs4, vs23
+ xsmaddadp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+ xsmuldp vs40, vs0, vs12
+ xsmuldp vs41, vs1, vs12
+
+ xsmuldp vs42, vs0, vs13
+ xsmuldp vs43, vs1, vs13
+
+ xsmuldp vs44, vs0, vs14
+ xsmuldp vs45, vs1, vs14
+
+ xsmuldp vs46, vs0, vs15
+ xsmuldp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+ xsmaddadp vs40, vs0, vs12
+ xsmaddadp vs41, vs1, vs12
+
+ xsmaddadp vs42, vs0, vs13
+ xsmaddadp vs43, vs1, vs13
+
+ xsmaddadp vs44, vs0, vs14
+ xsmaddadp vs45, vs1, vs14
+
+ xsmaddadp vs46, vs0, vs15
+ xsmaddadp vs47, vs1, vs15
+
+
+.endm
+
+.macro SAVE8x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+ xsmuldp vs1, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+ xsmaddadp vs1, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs36, alpha_r
+ xsmuldp vs1, vs37, alpha_r
+#else
+ xsmaddadp vs0, vs36, alpha_r
+ xsmaddadp vs1, vs37, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs38, alpha_r
+ xsmuldp vs1, vs39, alpha_r
+#else
+ xsmaddadp vs0, vs38, alpha_r
+ xsmaddadp vs1, vs39, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs40, alpha_r
+ xsmuldp vs1, vs41, alpha_r
+#else
+ xsmaddadp vs0, vs40, alpha_r
+ xsmaddadp vs1, vs41, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs42, alpha_r
+ xsmuldp vs1, vs43, alpha_r
+#else
+ xsmaddadp vs0, vs42, alpha_r
+ xsmaddadp vs1, vs43, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs44, alpha_r
+ xsmuldp vs1, vs45, alpha_r
+#else
+ xsmaddadp vs0, vs44, alpha_r
+ xsmaddadp vs1, vs45, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs46, alpha_r
+ xsmuldp vs1, vs47, alpha_r
+#else
+ xsmaddadp vs0, vs46, alpha_r
+ xsmaddadp vs1, vs47, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
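
Each tile size in this file comes with the same family of macros: LOADmxn_1 fills one register set from the packed A and B panels, KERNELmxn_I1 starts the unrolled loop (it prefetches the next k-slice into the alternate register set and initializes the accumulators with a plain multiply), KERNELmxn_1 and KERNELmxn_2 ping-pong between the two register sets so that slice k is multiplied while slice k+1 is being loaded, KERNELmxn_E2 drains the pipeline without issuing further loads, and KERNELmxn_SUBI1/KERNELmxn_SUB1 are the straight-line variants used for the K remainder. The sketch below models that schedule in C for the 8x2 tile just defined; it is an illustration only, so the function name, the plain K x 2 / K x 8 panel layout, the requirement that K be even, and the double accumulators standing in for vs32..vs47 are assumptions of the sketch, not taken from the patch.

    #include <stddef.h>

    /* Illustrative model of the KERNEL8x2_* software pipeline (assumes K even, K >= 2). */
    static void sgemm_8x2_pipeline_model(size_t K, const float *A, const float *B,
                                         double acc[8][2])
    {
        double a0[2], b0[8];              /* "even" set: vs0..vs1, vs8..vs15  */
        double a1[2], b1[8];              /* "odd"  set: vs4..vs5, vs16..vs23 */
        size_t k;
        int i, j;

    #define LOAD_SLICE(a, b, kk)                                  \
        do {                                                      \
            for (i = 0; i < 2; i++) (a)[i] = A[(kk) * 2 + i];     \
            for (j = 0; j < 8; j++) (b)[j] = B[(kk) * 8 + j];     \
        } while (0)

        LOAD_SLICE(a0, b0, 0);                     /* LOAD8x2_1                      */
        LOAD_SLICE(a1, b1, 1);                     /* KERNEL8x2_I1: prefetch k=1 ... */
        for (j = 0; j < 8; j++)
            for (i = 0; i < 2; i++)
                acc[j][i] = a0[i] * b0[j];         /* ... and init, like xsmuldp     */

        for (k = 2; k + 1 < K; k += 2) {
            LOAD_SLICE(a0, b0, k);                 /* KERNEL8x2_2: consume odd set   */
            for (j = 0; j < 8; j++)
                for (i = 0; i < 2; i++)
                    acc[j][i] += a1[i] * b1[j];
            LOAD_SLICE(a1, b1, k + 1);             /* KERNEL8x2_1: consume even set  */
            for (j = 0; j < 8; j++)
                for (i = 0; i < 2; i++)
                    acc[j][i] += a0[i] * b0[j];
        }

        for (j = 0; j < 8; j++)                    /* KERNEL8x2_E2: drain, no loads  */
            for (i = 0; i < 2; i++)
                acc[j][i] += a1[i] * b1[j];
    #undef LOAD_SLICE
    }

On the hardware the point of the ping-pong is to hide the lxsspx/lxvw4x load latency behind the floating-point work; the C model only preserves the ordering.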
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
+
+.macro LOAD8x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+.endm
+
+.macro KERNEL8x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o16, T1
+ lxsspx vs22, o32, T1
+ lxsspx vs23, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+ xsmuldp vs36, vs0, vs12
+
+ xsmuldp vs37, vs0, vs13
+
+ xsmuldp vs38, vs0, vs14
+
+ xsmuldp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o16, T1
+ lxsspx vs22, o32, T1
+ lxsspx vs23, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+ xsmaddadp vs36, vs0, vs12
+
+ xsmaddadp vs37, vs0, vs13
+
+ xsmaddadp vs38, vs0, vs14
+
+ xsmaddadp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+ xsmaddadp vs36, vs4, vs20
+
+ xsmaddadp vs37, vs4, vs21
+
+ xsmaddadp vs38, vs4, vs22
+
+ xsmaddadp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+ xsmaddadp vs36, vs4, vs20
+
+ xsmaddadp vs37, vs4, vs21
+
+ xsmaddadp vs38, vs4, vs22
+
+ xsmaddadp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+ xsmuldp vs36, vs0, vs12
+
+ xsmuldp vs37, vs0, vs13
+
+ xsmuldp vs38, vs0, vs14
+
+ xsmuldp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+ addi T1, T1, 64
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o16, T1
+ lxsspx vs14, o32, T1
+ lxsspx vs15, o48, T1
+
+
+ addi BO, BO, 128
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+ xsmaddadp vs36, vs0, vs12
+
+ xsmaddadp vs37, vs0, vs13
+
+ xsmaddadp vs38, vs0, vs14
+
+ xsmaddadp vs39, vs0, vs15
+
+
+.endm
+
+.macro SAVE8x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs36, alpha_r
+#else
+ xsmaddadp vs0, vs36, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs37, alpha_r
+#else
+ xsmaddadp vs0, vs37, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs38, alpha_r
+#else
+ xsmaddadp vs0, vs38, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs39, alpha_r
+#else
+ xsmaddadp vs0, vs39, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
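
The SAVEmxn macros do the write-back. In a plain GEMM build each affected C entry is loaded first and updated as C = C + alpha * acc (the loaded value becomes the addend of xsmaddadp or xvmaddasp), while a TRMMKERNEL build skips the load and stores alpha * acc directly, because the TRMM output tile must not be read before it is written; presumably, as is usual in GotoBLAS-style drivers, any beta scaling of C has already happened before the kernel runs, which is why no beta factor appears here. A minimal C model of one of the eight columns handled by SAVE8x1 (the helper name and the trmm_kernel flag standing in for the preprocessor switch are assumptions of the sketch):

    /* One column of SAVE8x1: co points at the single C entry this M=1 tile
     * touches in that column, acc is the matching accumulator (vs32..vs39). */
    static inline void save8x1_one_column(float *co, double acc, double alpha_r,
                                          int trmm_kernel)
    {
        double v;

        if (trmm_kernel)
            v = acc * alpha_r;                   /* xsmuldp: C is write-only       */
        else
            v = (double)co[0] + acc * alpha_r;   /* lxsspx + xsmaddadp: C += a*acc */

        co[0] = (float)v;                        /* stxsspx rounds back to single  */
    }

Between columns the macro advances T1 by LDC to step through C, and at the end CO advances by 4 bytes, the one float of output this M=1 tile produced.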
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+.macro LOAD4x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+.endm
+
+.macro KERNEL4x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+
+.endm
+
+.macro SAVE4x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+ xvmulsp vs2, vs42, alpha_vr
+ xvmulsp vs3, vs43, alpha_vr
+#else
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+ xvmaddasp vs2, vs42, alpha_vr
+ xvmaddasp vs3, vs43, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+ xvmulsp vs2, vs46, alpha_vr
+ xvmulsp vs3, vs47, alpha_vr
+#else
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+ xvmaddasp vs2, vs46, alpha_vr
+ xvmaddasp vs3, vs47, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
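
The N=4, M=16 family is the fully vectorized path: each k step pulls 16 floats of A into four VSX registers (vs0..vs3) and, although only four B values are needed, reads 64 bytes of B as four full vectors (vs8..vs11). That only makes sense if the packed B panel stores every value replicated four times, so that vs8 holds {b0,b0,b0,b0}, vs9 holds {b1,b1,b1,b1} and so on, which turns each xvmaddasp into one quarter of a rank-1 update of the 16x4 tile; the scalar M=2/M=1 macros, which read B at 16-byte offsets (o0/o16/o32/o48), i.e. one lane per broadcast group, point at the same layout. This replication is an inference from the offsets, not something stated in the patch. Under that assumption, one KERNEL4x16_SUB1 step is equivalent to the following lane-by-lane C loop (the names are mine):

    /* One k step of the 16x4 tile, with bo laid out as b0 b0 b0 b0 b1 b1 ... */
    static void kernel4x16_sub1_model(const float *ao,  /* 16 floats: A(:,k)      */
                                      const float *bo,  /* 16 floats, broadcast B */
                                      float acc[4][16]) /* mirrors vs32..vs47     */
    {
        for (int j = 0; j < 4; j++) {        /* one accumulator row per B value */
            float bj = bo[4 * j];            /* every lane of the group is bj   */
            for (int i = 0; i < 16; i++)
                acc[j][i] += ao[i] * bj;
        }
    }

SAVE4x16 then combines the sixteen accumulators with alpha and the four columns of C (16 floats each) exactly as in the scalar case, only with xvmaddasp/xvmulsp in place of the xs* forms.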
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs38, alpha_vr
+ xvmulsp vs1, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs38, alpha_vr
+ xvmaddasp vs1, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro LOAD4x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+ lxvw4x vs18, o32, T1
+ lxvw4x vs19, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+ lxvw4x vs10, o32, T1
+ lxvw4x vs11, o48, T1
+
+ addi BO, BO, 64
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro LOAD4x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+ xsmuldp vs1, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+ xsmaddadp vs1, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs36, alpha_r
+ xsmuldp vs1, vs37, alpha_r
+#else
+ xsmaddadp vs0, vs36, alpha_r
+ xsmaddadp vs1, vs37, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs38, alpha_r
+ xsmuldp vs1, vs39, alpha_r
+#else
+ xsmaddadp vs0, vs38, alpha_r
+ xsmaddadp vs1, vs39, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro LOAD4x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+ lxsspx vs18, o32, T1
+ lxsspx vs19, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+ lxsspx vs10, o32, T1
+ lxsspx vs11, o48, T1
+
+
+ addi BO, BO, 64
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
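
With the N=4 family now complete for M = 16, 8, 4, 2 and 1, the purpose of the cascade is visible: the code that drives these macros peels the widest tile that still fits into the remaining M and falls through to the narrower ones for the remainder, and the same cascade repeats below for N=2 and N=1. Roughly, in C (this is only an illustration of the control flow, not the file's actual driver, which is assembly; the tile callbacks are hypothetical stand-ins for the LOAD/KERNEL/SAVE sequences):

    /* How one N=4 panel consumes an arbitrary M with the tile sizes above. */
    static void n4_panel_model(long M,
                               void (*tile16)(void), void (*tile8)(void),
                               void (*tile4)(void),  void (*tile2)(void),
                               void (*tile1)(void))
    {
        long i = 0;
        for (; i + 16 <= M; i += 16) tile16();  /* SAVE4x16 moves CO by 64 bytes */
        if (i + 8 <= M) { tile8(); i += 8; }    /* SAVE4x8  moves CO by 32       */
        if (i + 4 <= M) { tile4(); i += 4; }    /* SAVE4x4  moves CO by 16       */
        if (i + 2 <= M) { tile2(); i += 2; }    /* SAVE4x2  moves CO by 8        */
        if (i < M)      { tile1(); i += 1; }    /* SAVE4x1  moves CO by 4        */
    }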
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+.macro LOAD2x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+ lxvw4x vs17, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+ lxvw4x vs9, o16, T1
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+ xsmuldp vs1, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+ xsmaddadp vs1, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o16, T1
+
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+.macro LOAD1x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL1x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs16, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ mr T1, BO
+
+ lxvw4x vs8, o0, T1
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
diff --git a/kernel/power/srot.c b/kernel/power/srot.c
new file mode 100644
index 000000000..d464846a4
--- /dev/null
+++ b/kernel/power/srot.c
@@ -0,0 +1,167 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/26 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+
+#include "common.h"
+
+#pragma GCC optimize "O1"
+
+#if defined(POWER8)
+#include "srot_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3;
+ FLOAT x00, x01, x02, x03;
+ FLOAT g0, g1, g2, g3;
+ FLOAT y00, y01, y02, y03;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT c1=*c;
+ FLOAT s1=*s;
+
+	while ( i<n )
+	{
+
+		x00 = x1[0];
+		y00 = y1[0];
+		x01 = x1[1];
+		y01 = y1[1];
+		x02 = x1[2];
+		y02 = y1[2];
+		x03 = x1[3];
+		y03 = y1[3];
+
+		f0 = c1*x00 + s1*y00;
+		g0 = c1*y00 - s1*x00;
+		f1 = c1*x01 + s1*y01;
+		g1 = c1*y01 - s1*x01;
+		f2 = c1*x02 + s1*y02;
+		g2 = c1*y02 - s1*x02;
+		f3 = c1*x03 + s1*y03;
+		g3 = c1*y03 - s1*x03;
+
+		x1[0] = f0;
+		y1[0] = g0;
+		x1[1] = f1;
+		y1[1] = g1;
+		x1[2] = f2;
+		y1[2] = g2;
+		x1[3] = f3;
+		y1[3] = g3;
+
+		x1 += 4;
+		y1 += 4;
+
+		i += 4;
+	}
+	return;
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2, FLOAT c, FLOAT s)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT c1[4] __attribute__ ((aligned (16)));
+	FLOAT s1[4] __attribute__ ((aligned (16)));
+	FLOAT *x1=x;
+	FLOAT *y1=y;
+	FLOAT temp;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		BLASLONG n1 = n & -16;
+
+		if ( n1 > 0 )
+ {
+ c1[0]=c;
+ c1[1]=c;
+ c1[2]=c;
+ c1[3]=c;
+ s1[0]=s;
+ s1[1]=s;
+ s1[2]=s;
+ s1[3]=s;
+ srot_kernel_16(n1, x1, y1, c1, s1);
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ temp = c*x[i] + s*y[i] ;
+ y[i] = c*y[i] - s*x[i] ;
+ x[i] = temp ;
+
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ temp = c*x[ix] + s*y[iy] ;
+ y[iy] = c*y[iy] - s*x[ix] ;
+ x[ix] = temp ;
+
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+}
+
+
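
srot applies a plane (Givens) rotation to a pair of single-precision vectors: x[i] becomes c*x[i] + s*y[i] and y[i] becomes c*y[i] - s*x[i]. The wrapper above broadcasts c and s into small aligned arrays, hands the largest multiple-of-16 prefix of the contiguous case (inc_x == 1 and inc_y == 1) to the POWER8 micro-kernel, and finishes the remainder and all strided cases with the scalar loops. A caller-side example through the CBLAS interface (link against OpenBLAS, e.g. with -lopenblas -lm):

    #include <stdio.h>
    #include <math.h>
    #include <cblas.h>

    int main(void)
    {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4] = {4.0f, 3.0f, 2.0f, 1.0f};
        float c = cosf(0.25f), s = sinf(0.25f);

        cblas_srot(4, x, 1, y, 1, c, s);   /* x <- c*x + s*y ; y <- c*y - s*x */

        for (int i = 0; i < 4; i++)
            printf("%f %f\n", x[i], y[i]);
        return 0;
    }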
diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
new file mode 100644
index 000000000..ade65500f
--- /dev/null
+++ b/kernel/power/srot_microk_power8.c
@@ -0,0 +1,208 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+*
+* Fused multiply-add is not used here ( precision problems with lapack )
+*
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
+
+static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
+
+static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *x2=x+1;
+ FLOAT *y2=y+1;
+
+ __asm__ __volatile__
+ (
+
+ "lxvw4x 36 , 0, %3 \n\t" // load c
+ "lxvw4x 37 , 0, %4 \n\t" // load s
+ "addi %8 , %8, -4 \n\t"
+ "addi %9 , %9, -4 \n\t"
+
+ "lxvw4x 32, 0, %1 \n\t" // load x
+ "lxvw4x 33, %5, %1 \n\t"
+ "lxvw4x 34, %6, %1 \n\t"
+ "lxvw4x 35, %7, %1 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t" // load y
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+
+ "addi %1, %1, 64 \n\t"
+ "addi %2, %2, 64 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "xvmulsp 48, 32, 36 \n\t" // c * x
+ "xvmulsp 49, 33, 36 \n\t"
+ "xvmulsp 50, 34, 36 \n\t"
+ "xvmulsp 51, 35, 36 \n\t"
+
+ "xvmulsp 56, 40, 36 \n\t" // c * y
+ "xvmulsp 57, 41, 36 \n\t"
+ "xvmulsp 58, 42, 36 \n\t"
+ "xvmulsp 59, 43, 36 \n\t"
+
+ "xvmulsp 52, 32, 37 \n\t" // s * x
+ "xvmulsp 53, 33, 37 \n\t"
+
+ "lxvw4x 32, 0, %1 \n\t" // load x
+ "lxvw4x 33, %5, %1 \n\t"
+
+ "xvmulsp 54, 34, 37 \n\t"
+ "xvmulsp 55, 35, 37 \n\t"
+
+ "lxvw4x 34, %6, %1 \n\t"
+ "lxvw4x 35, %7, %1 \n\t"
+
+ "xvmulsp 60, 40, 37 \n\t" // s * y
+ "xvmulsp 61, 41, 37 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t" // load y
+ "lxvw4x 41, %5, %2 \n\t"
+
+ "xvmulsp 62, 42, 37 \n\t"
+ "xvmulsp 63, 43, 37 \n\t"
+
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+
+ "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
+ "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
+
+ "addi %1, %1, 64 \n\t"
+ "addi %2, %2, 64 \n\t"
+
+ "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
+ "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
+
+ "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
+ "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
+ "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
+ "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x
+
+ "stxvw4x 48, 0, %8 \n\t" // store x
+ "stxvw4x 49, %5, %8 \n\t"
+ "stxvw4x 50, %6, %8 \n\t"
+ "stxvw4x 51, %7, %8 \n\t"
+
+ "stxvw4x 56, 0, %9 \n\t" // store y
+ "stxvw4x 57, %5, %9 \n\t"
+ "stxvw4x 58, %6, %9 \n\t"
+ "stxvw4x 59, %7, %9 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+ "addi %9, %9, 64 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmulsp 48, 32, 36 \n\t" // c * x
+ "xvmulsp 49, 33, 36 \n\t"
+ "xvmulsp 50, 34, 36 \n\t"
+ "xvmulsp 51, 35, 36 \n\t"
+
+ "xvmulsp 56, 40, 36 \n\t" // c * y
+ "xvmulsp 57, 41, 36 \n\t"
+ "xvmulsp 58, 42, 36 \n\t"
+ "xvmulsp 59, 43, 36 \n\t"
+
+ "xvmulsp 52, 32, 37 \n\t" // s * x
+ "xvmulsp 53, 33, 37 \n\t"
+ "xvmulsp 54, 34, 37 \n\t"
+ "xvmulsp 55, 35, 37 \n\t"
+
+ "xvmulsp 60, 40, 37 \n\t" // s * y
+ "xvmulsp 61, 41, 37 \n\t"
+ "xvmulsp 62, 42, 37 \n\t"
+ "xvmulsp 63, 43, 37 \n\t"
+
+ "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
+ "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
+ "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
+ "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
+
+ "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
+ "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
+ "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
+ "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x
+
+ "stxvw4x 48, 0, %8 \n\t" // store x
+ "stxvw4x 49, %5, %8 \n\t"
+ "stxvw4x 50, %6, %8 \n\t"
+ "stxvw4x 51, %7, %8 \n\t"
+
+ "stxvw4x 56, 0, %9 \n\t" // store y
+ "stxvw4x 57, %5, %9 \n\t"
+ "stxvw4x 58, %6, %9 \n\t"
+ "stxvw4x 59, %7, %9 \n\t"
+
+
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x1), // 1
+ "r" (y1), // 2
+ "r" (c), // 3
+ "r" (s), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (x2), // 8
+ "r" (y2) // 9
+ : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
+ );
+
+}
+
+
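The comment at the top of srot_microk_power8.c notes that fused multiply-add was avoided because of precision problems with LAPACK. A small, purely illustrative sketch of the rounding difference involved: fmaf() rounds a*b+c once, while a separate multiply and add round twice, so the two forms can differ in the last bit. The values and names below are arbitrary examples, not taken from the patch.

#include <math.h>
#include <stdio.h>

int main(void)
{
    float c = 0.8f, s = 0.6f, x = 1.0000001f, y = 0.99999988f;

    float z        = -s * x;            /* rounded once                       */
    float fused    = fmaf(c, y, z);     /* c*y + z with a single rounding     */
    float separate = c * y + z;         /* c*y rounded, then the add rounded  */

    printf("fused = %.9g, separate = %.9g\n", fused, separate);
    return 0;
}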
diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c
new file mode 100644
index 000000000..c6ef5e969
--- /dev/null
+++ b/kernel/power/sscal.c
@@ -0,0 +1,179 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "sscal_microk_power8.c"
+#endif
+
+
+#if !defined(HAVE_KERNEL_16)
+
+static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
+{
+
+ BLASLONG i;
+ FLOAT alpha = *da;
+
+	if ( n1 > 0 )
+ {
+ alpha[0]=da;
+ alpha[1]=da;
+ alpha[2]=da;
+ alpha[3]=da;
+ sscal_kernel_16_zero(n1 , alpha , x);
+ j=n1;
+ }
+
+ while(j < n)
+ {
+
+ x[j]=0.0;
+ j++;
+ }
+
+ }
+ else
+ {
+
+ BLASLONG n1 = n & -32;
+ if ( n1 > 0 )
+ {
+ alpha[0]=da;
+ alpha[1]=da;
+ alpha[2]=da;
+ alpha[3]=da;
+ sscal_kernel_16(n1 , alpha , x);
+ j=n1;
+ }
+ while(j < n)
+ {
+
+ x[j] = da * x[j] ;
+ j++;
+ }
+ }
+
+
+ }
+ else
+ {
+
+ if ( da == 0.0 )
+ {
+
+ while(j < n)
+ {
+
+ x[i]=0.0;
+ i += inc_x ;
+ j++;
+ }
+
+ }
+ else
+ {
+
+ while(j < n)
+ {
+
+ x[i] = da * x[i] ;
+ i += inc_x ;
+ j++;
+ }
+ }
+
+ }
+ return 0;
+
+}
+
+
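The sscal.c dispatcher above selects between two assembly kernels: sscal_kernel_16_zero when the scale factor is zero and sscal_kernel_16 otherwise, with a scalar tail loop for the leftover elements. A minimal scalar sketch of the same computation (illustrative only, hypothetical names):

#include <stdio.h>

static void sscal_ref(long n, float alpha, float *x)
{
    if (alpha == 0.0f) {
        for (long i = 0; i < n; i++)
            x[i] = 0.0f;                /* plain store; multiplying by zero would keep NaN/Inf */
    } else {
        for (long i = 0; i < n; i++)
            x[i] = alpha * x[i];
    }
}

int main(void)
{
    float x[4] = {1, 2, 3, 4};
    sscal_ref(4, 2.5f, x);
    printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]);
    return 0;
}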
diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
new file mode 100644
index 000000000..963cec777
--- /dev/null
+++ b/kernel/power/sscal_microk_power8.c
@@ -0,0 +1,218 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
+
+static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+
+static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *x2=x+1;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "lxvw4x 32, 0, %3 \n\t"
+ "addi %1, %1, -4 \n\t"
+
+ "dcbt %2, %4 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %4 \n\t"
+
+ "xvmulsp 48, 40, 32 \n\t"
+ "xvmulsp 49, 41, 32 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "xvmulsp 50, 42, 32 \n\t"
+ "xvmulsp 51, 43, 32 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "xvmulsp 52, 44, 32 \n\t"
+ "xvmulsp 53, 45, 32 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "xvmulsp 54, 46, 32 \n\t"
+ "xvmulsp 55, 47, 32 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "stxvw4x 48, 0, %1 \n\t"
+ "stxvw4x 49, %5, %1 \n\t"
+ "stxvw4x 50, %6, %1 \n\t"
+ "stxvw4x 51, %7, %1 \n\t"
+ "stxvw4x 52, %8, %1 \n\t"
+ "stxvw4x 53, %9, %1 \n\t"
+ "stxvw4x 54, %10, %1 \n\t"
+ "stxvw4x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmulsp 48, 40, 32 \n\t"
+ "xvmulsp 49, 41, 32 \n\t"
+ "xvmulsp 50, 42, 32 \n\t"
+ "xvmulsp 51, 43, 32 \n\t"
+ "xvmulsp 52, 44, 32 \n\t"
+ "xvmulsp 53, 45, 32 \n\t"
+ "xvmulsp 54, 46, 32 \n\t"
+ "xvmulsp 55, 47, 32 \n\t"
+
+ "stxvw4x 48, 0, %1 \n\t"
+ "stxvw4x 49, %5, %1 \n\t"
+ "stxvw4x 50, %6, %1 \n\t"
+ "stxvw4x 51, %7, %1 \n\t"
+ "stxvw4x 52, %8, %1 \n\t"
+ "stxvw4x 53, %9, %1 \n\t"
+ "stxvw4x 54, %10, %1 \n\t"
+ "stxvw4x 55, %11, %1 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x2), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
+static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+
+static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *x2=x+1;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "xxlxor 32 , 32 , 32 \n\t"
+ "addi %1, %1, -4 \n\t"
+
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "stxvw4x 32, 0, %1 \n\t"
+ "stxvw4x 32, %5, %1 \n\t"
+ "stxvw4x 32, %6, %1 \n\t"
+ "stxvw4x 32, %7, %1 \n\t"
+ "stxvw4x 32, %8, %1 \n\t"
+ "stxvw4x 32, %9, %1 \n\t"
+ "stxvw4x 32, %10, %1 \n\t"
+ "stxvw4x 32, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x2), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
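Both assembly kernels above consume 32 floats per iteration (eight 16-byte vectors, with the counter decremented by 32), which is why the sscal.c dispatcher rounds the length down with n & -32 and leaves the remainder to its scalar tail loop. A small sketch of that split; the numbers are arbitrary examples:

#include <stdio.h>

int main(void)
{
    long n    = 1000;
    long n1   = n & -32;    /* 992 elements handed to the vector kernel */
    long tail = n - n1;     /*   8 elements handled by the scalar tail  */

    printf("n=%ld vector=%ld tail=%ld\n", n, n1, tail);
    return 0;
}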
diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c
new file mode 100644
index 000000000..932652b37
--- /dev/null
+++ b/kernel/power/sswap.c
@@ -0,0 +1,154 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "sswap_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT g0, g1, g2, g3, g4, g5, g6, g7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+	if ( n1 > 0 )
+ {
+ sswap_kernel_32(n1, x, y);
+ i=n1;
+ }
+
+ while(i < n)
+ {
+ temp = y[i];
+ y[i] = x[i] ;
+ x[i] = temp;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ temp = y[iy];
+ y[iy] = x[ix] ;
+ x[ix] = temp;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
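For reference, the element swap sswap.c performs: the 32-element assembly kernel covers the unit-stride bulk, and the loop shown above handles the general strided case. A minimal sketch of the strided form (illustrative only; the names and example strides are hypothetical):

#include <stdio.h>

static void sswap_ref(long n, float *x, long inc_x, float *y, long inc_y)
{
    long ix = 0, iy = 0;
    for (long i = 0; i < n; i++) {
        float temp = y[iy];
        y[iy] = x[ix];
        x[ix] = temp;
        ix += inc_x;
        iy += inc_y;
    }
}

int main(void)
{
    float x[4] = {1, 2, 3, 4}, y[4] = {5, 6, 7, 8};
    sswap_ref(2, x, 2, y, 2);               /* swap every other element */
    printf("%g %g %g %g / %g %g %g %g\n",
           x[0], x[1], x[2], x[3], y[0], y[1], y[2], y[3]);
    return 0;
}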
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
new file mode 100644
index 000000000..c48e743de
--- /dev/null
+++ b/kernel/power/sswap_microk_power8.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_32 1
+
+static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *x2=x+1;
+ FLOAT *y2=y+1;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
+ __asm__ __volatile__
+ (
+
+ "addi %3, %3, -4 \n\t"
+ "addi %4, %4, -4 \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %5, %2 \n\t"
+ "lxvw4x 34, %6, %2 \n\t"
+ "lxvw4x 35, %7, %2 \n\t"
+ "lxvw4x 36, %8, %2 \n\t"
+ "lxvw4x 37, %9, %2 \n\t"
+ "lxvw4x 38, %10, %2 \n\t"
+ "lxvw4x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvw4x 48, 0, %1 \n\t"
+ "lxvw4x 49, %5, %1 \n\t"
+ "lxvw4x 50, %6, %1 \n\t"
+ "lxvw4x 51, %7, %1 \n\t"
+ "lxvw4x 52, %8, %1 \n\t"
+ "lxvw4x 53, %9, %1 \n\t"
+ "lxvw4x 54, %10, %1 \n\t"
+ "lxvw4x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 48, 0, %4 \n\t"
+ "stxvw4x 49, %5, %4 \n\t"
+ "stxvw4x 50, %6, %4 \n\t"
+ "stxvw4x 51, %7, %4 \n\t"
+ "stxvw4x 52, %8, %4 \n\t"
+ "stxvw4x 53, %9, %4 \n\t"
+ "stxvw4x 54, %10, %4 \n\t"
+ "stxvw4x 55, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %0 , %0 , -32 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (y2), // 3
+ "r" (x2), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
+ );
+
+}
+
+
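The assembly kernel above swaps 32 floats per iteration by holding eight 16-byte vectors from each array in registers and storing them back crosswise. A portable C picture of the same blocking, purely as a sketch (block size chosen to match the kernel; names are hypothetical):

#include <stdio.h>
#include <string.h>

static void sswap_block32(long n32, float *x, float *y)
{
    for (long i = 0; i + 32 <= n32; i += 32) {   /* n32 is expected to be a multiple of 32 */
        float tmp[32];
        memcpy(tmp,   &x[i], sizeof tmp);
        memcpy(&x[i], &y[i], sizeof tmp);
        memcpy(&y[i], tmp,   sizeof tmp);
    }
}

int main(void)
{
    float x[32], y[32];
    for (int i = 0; i < 32; i++) { x[i] = (float)i; y[i] = 100.0f + i; }
    sswap_block32(32, x, y);
    printf("x[0]=%g y[0]=%g\n", x[0], y[0]);     /* prints 100 and 0 */
    return 0;
}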
diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S
new file mode 100644
index 000000000..f756d5d92
--- /dev/null
+++ b/kernel/power/strmm_kernel_16x8_power8.S
@@ -0,0 +1,369 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 340
+#define ALPHA_SP 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs30
+#define alpha_vr vs31
+
+#define o0 0
+
+#define TBUFFER r13
+#define o12 r14
+#define o4 r15
+#define K1 r16
+#define o8 r17
+#define L r18
+#define T1 r19
+#define KK r20
+#define KKK r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T2 r31
+
+#include "strmm_macros_16x8_power8.S"
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+ std r13, 288(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+ stw r13, 216(SP)
+#endif
+
+ // stfd f1, ALPHA_SP
+ // stw r0, FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+ slwi LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+ mr KK, OFFSET
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, KK
+#endif
+
+
+ cmpwi cr0, M, 0
+ ble L999_H1
+ cmpwi cr0, N, 0
+ ble L999_H1
+ cmpwi cr0, K, 0
+ ble L999_H1
+
+ li PRE, 256
+ li o4 , 4
+ li o8 , 8
+ li o12, 12
+ li o16, 16
+ li o32, 32
+ li o48, 48
+ addi TBUFFER, SP, 320
+
+ addi T1, SP, 300
+ stxsspx f1, o0 , T1
+ stxsspx f1, o4 , T1
+ stxsspx f1, o8 , T1
+ stxsspx f1, o12 , T1
+
+ lxsspx alpha_r, o0, T1
+ lxvw4x alpha_vr, o0, T1
+
+
+
+#include "strmm_logic_16x8_power8.S"
+
+L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+ ld r13, 288(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+ lwz r13, 216(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
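One detail of the prologue above worth spelling out: the incoming alpha (f1) is stored four times into a 16-byte stack buffer at SP+300 so it can be reloaded both as a scalar (alpha_r) and as a four-lane vector (alpha_vr) for the save macros. A plain C sketch of that splat, with an arbitrary example value and hypothetical names:

#include <stdio.h>

int main(void)
{
    float alpha = 2.5f;          /* example; the real value arrives in f1 */
    float buf[4];

    for (int i = 0; i < 4; i++)
        buf[i] = alpha;          /* splat the scalar across the 16-byte buffer */

    printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);
    return 0;
}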
diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S
new file mode 100644
index 000000000..fb2d3f94b
--- /dev/null
+++ b/kernel/power/strmm_logic_16x8_power8.S
@@ -0,0 +1,2968 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
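A rough map of the tiling the logic below walks through, as a sketch only (not part of the patch): the N columns are consumed in blocks of 8, then 4, 2, 1, and inside each column block the M rows in blocks of 16, then 8, 4, 2, 1. Only the loop shape is shown; the real code expands every tile into the VSX kernel macros from strmm_macros_16x8_power8.S, and the example sizes are arbitrary.

#include <stdio.h>

int main(void)
{
    int M = 37, N = 13;                              /* example problem sizes */

    for (int nb = 8; nb >= 1; nb >>= 1) {            /* column block widths: 8,4,2,1 */
        int jcount = (nb == 8) ? (N >> 3) : ((N & nb) ? 1 : 0);
        for (int j = 0; j < jcount; j++) {
            for (int mb = 16; mb >= 1; mb >>= 1) {   /* row block heights: 16,8,4,2,1 */
                int icount = (mb == 16) ? (M >> 4) : ((M & mb) ? 1 : 0);
                for (int i = 0; i < icount; i++)
                    printf("%2d x %d tile\n", mb, nb);
            }
        }
    }
    return 0;
}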
+ srawi. J, N, 3
+ ble STRMM_L8_END
+
+STRMM_L8_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 3
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble STRMM_L8x16_END
+
+STRMM_L8x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L8x16_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L8x16_SUB4
+
+STRMM_L8x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_I1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ addic. L, L, -2
+ ble STRMM_L8x16_LOOP_END
+
+ .align 5
+
+STRMM_L8x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ addic. L, L, -1
+ bgt STRMM_L8x16_LOOP
+
+STRMM_L8x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ KERNEL8x16_E2
+
+ b STRMM_L8x16_SUB1
+
+STRMM_L8x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL8x16_SUBI1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+
+ b STRMM_L8x16_SUB1
+
+STRMM_L8x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x16_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L8x16_SAVE
+ b STRMM_L8x16_SUB2
+
+STRMM_L8x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L8x16_SAVE
+
+STRMM_L8x16_SUB2:
+
+ KERNEL8x16_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L8x16_SUB2
+
+STRMM_L8x16_SAVE:
+
+ SAVE8x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt STRMM_L8x16_BEGIN
+
+STRMM_L8x16_END:
+
+STRMM_L8x8_BEGIN:
+ andi. T2, M, 15
+ ble STRMM_L8x1_END
+
+ andi. T1, M, 8
+ ble STRMM_L8x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L8x8_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L8x8_SUB4
+
+STRMM_L8x8_LOOP_START:
+
+ LOAD8x8_1
+ KERNEL8x8_I1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ addic. L, L, -2
+ ble STRMM_L8x8_LOOP_END
+
+ .align 5
+
+STRMM_L8x8_LOOP:
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ addic. L, L, -1
+ bgt STRMM_L8x8_LOOP
+
+STRMM_L8x8_LOOP_END:
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_E2
+
+ b STRMM_L8x8_SUB1
+
+STRMM_L8x8_SUB4:
+
+ KERNEL8x8_SUBI1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+
+ b STRMM_L8x8_SUB1
+
+STRMM_L8x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x8_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L8x8_SAVE
+ b STRMM_L8x8_SUB2
+
+STRMM_L8x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L8x8_SAVE
+
+STRMM_L8x8_SUB2:
+
+ KERNEL8x8_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L8x8_SUB2
+
+STRMM_L8x8_SAVE:
+
+ SAVE8x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+STRMM_L8x8_END:
+
+STRMM_L8x4_BEGIN:
+
+ andi. T1, M, 4
+ ble STRMM_L8x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L8x4_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L8x4_SUB4
+
+STRMM_L8x4_LOOP_START:
+
+ LOAD8x4_1
+ KERNEL8x4_I1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ addic. L, L, -2
+ ble STRMM_L8x4_LOOP_END
+
+ .align 5
+
+STRMM_L8x4_LOOP:
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ addic. L, L, -1
+ bgt STRMM_L8x4_LOOP
+
+STRMM_L8x4_LOOP_END:
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_E2
+
+ b STRMM_L8x4_SUB1
+
+STRMM_L8x4_SUB4:
+
+ KERNEL8x4_SUBI1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+
+ b STRMM_L8x4_SUB1
+
+STRMM_L8x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x4_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L8x4_SAVE
+ b STRMM_L8x4_SUB2
+
+STRMM_L8x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L8x4_SAVE
+
+STRMM_L8x4_SUB2:
+
+ KERNEL8x4_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L8x4_SUB2
+
+STRMM_L8x4_SAVE:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+STRMM_L8x4_END:
+
+STRMM_L8x2_BEGIN:
+
+ andi. T1, M, 2
+ ble STRMM_L8x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L8x2_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L8x2_SUB4
+
+STRMM_L8x2_LOOP_START:
+
+ LOAD8x2_1
+ KERNEL8x2_I1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ addic. L, L, -2
+ ble STRMM_L8x2_LOOP_END
+
+ .align 5
+
+STRMM_L8x2_LOOP:
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ addic. L, L, -1
+ bgt STRMM_L8x2_LOOP
+
+STRMM_L8x2_LOOP_END:
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_E2
+
+ b STRMM_L8x2_SUB1
+
+STRMM_L8x2_SUB4:
+
+ KERNEL8x2_SUBI1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+
+ b STRMM_L8x2_SUB1
+
+STRMM_L8x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x2_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L8x2_SAVE
+ b STRMM_L8x2_SUB2
+
+STRMM_L8x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L8x2_SAVE
+
+STRMM_L8x2_SUB2:
+
+ KERNEL8x2_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L8x2_SUB2
+
+STRMM_L8x2_SAVE:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+STRMM_L8x2_END:
+
+STRMM_L8x1_BEGIN:
+
+ andi. T1, M, 1
+ ble STRMM_L8x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L8x1_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L8x1_SUB4
+
+STRMM_L8x1_LOOP_START:
+
+ LOAD8x1_1
+ KERNEL8x1_I1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ addic. L, L, -2
+ ble STRMM_L8x1_LOOP_END
+
+ .align 5
+
+STRMM_L8x1_LOOP:
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ addic. L, L, -1
+ bgt STRMM_L8x1_LOOP
+
+STRMM_L8x1_LOOP_END:
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_E2
+
+ b STRMM_L8x1_SUB1
+
+STRMM_L8x1_SUB4:
+
+ KERNEL8x1_SUBI1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+
+ b STRMM_L8x1_SUB1
+
+STRMM_L8x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x1_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L8x1_SAVE
+ b STRMM_L8x1_SUB2
+
+STRMM_L8x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L8x1_SAVE
+
+STRMM_L8x1_SUB2:
+
+ KERNEL8x1_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L8x1_SUB2
+
+STRMM_L8x1_SAVE:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+STRMM_L8x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in B
+#endif
+
+
+ addic. J, J, -1
+ bgt STRMM_L8_BEGIN
+
+ andi. T2, N, 7
+ ble L999
+
+STRMM_L8_END:
+
+ b STRMM_L4_BEGIN
+
+L999_H1:
+
+ b L999
+
+STRMM_L4_BEGIN:
+
+ andi. T1, N, 4
+ ble STRMM_L4_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble STRMM_L4x16_END
+
+STRMM_L4x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L4x16_SUB4
+
+STRMM_L4x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble STRMM_L4x16_LOOP_END
+
+ .align 5
+
+STRMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt STRMM_L4x16_LOOP
+
+STRMM_L4x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b STRMM_L4x16_SUB1
+
+STRMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b STRMM_L4x16_SUB1
+
+STRMM_L4x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L4x16_SAVE
+ b STRMM_L4x16_SUB2
+
+STRMM_L4x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L4x16_SAVE
+
+STRMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L4x16_SUB2
+
+STRMM_L4x16_SAVE:
+
+ SAVE4x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt STRMM_L4x16_BEGIN
+
+STRMM_L4x16_END:
+
+STRMM_L4x8_BEGIN:
+ andi. T2, M, 15
+ ble STRMM_L4x1_END
+
+ andi. T1, M, 8
+ ble STRMM_L4x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L4x8_SUB4
+
+STRMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble STRMM_L4x8_LOOP_END
+
+ .align 5
+
+STRMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt STRMM_L4x8_LOOP
+
+STRMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b STRMM_L4x8_SUB1
+
+STRMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b STRMM_L4x8_SUB1
+
+STRMM_L4x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L4x8_SAVE
+ b STRMM_L4x8_SUB2
+
+STRMM_L4x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L4x8_SAVE
+
+STRMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L4x8_SUB2
+
+STRMM_L4x8_SAVE:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+STRMM_L4x8_END:
+
+STRMM_L4x4_BEGIN:
+
+ andi. T1, M, 4
+ ble STRMM_L4x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L4x4_SUB4
+
+STRMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble STRMM_L4x4_LOOP_END
+
+ .align 5
+
+STRMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt STRMM_L4x4_LOOP
+
+STRMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b STRMM_L4x4_SUB1
+
+STRMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b STRMM_L4x4_SUB1
+
+STRMM_L4x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L4x4_SAVE
+ b STRMM_L4x4_SUB2
+
+STRMM_L4x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L4x4_SAVE
+
+STRMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L4x4_SUB2
+
+STRMM_L4x4_SAVE:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+STRMM_L4x4_END:
+
+STRMM_L4x2_BEGIN:
+
+ andi. T1, M, 2
+ ble STRMM_L4x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L4x2_SUB4
+
+STRMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble STRMM_L4x2_LOOP_END
+
+ .align 5
+
+STRMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt STRMM_L4x2_LOOP
+
+STRMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b STRMM_L4x2_SUB1
+
+STRMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b STRMM_L4x2_SUB1
+
+STRMM_L4x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L4x2_SAVE
+ b STRMM_L4x2_SUB2
+
+STRMM_L4x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L4x2_SAVE
+
+STRMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L4x2_SUB2
+
+STRMM_L4x2_SAVE:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+STRMM_L4x2_END:
+
+STRMM_L4x1_BEGIN:
+
+ andi. T1, M, 1
+ ble STRMM_L4x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L4x1_SUB4
+
+STRMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble STRMM_L4x1_LOOP_END
+
+ .align 5
+
+STRMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt STRMM_L4x1_LOOP
+
+STRMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b STRMM_L4x1_SUB1
+
+STRMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b STRMM_L4x1_SUB1
+
+STRMM_L4x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L4x1_SAVE
+ b STRMM_L4x1_SUB2
+
+STRMM_L4x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L4x1_SAVE
+
+STRMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L4x1_SUB2
+
+STRMM_L4x1_SAVE:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+STRMM_L4x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in B
+#endif
+
+
+STRMM_L4_END:
+STRMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble STRMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble STRMM_L2x16_END
+
+STRMM_L2x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L2x16_SUB4
+
+STRMM_L2x16_LOOP_START:
+
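+	// dcbt gives data-cache prefetch hints for the A panel; PRE is assumed to
+	// hold the prefetch distance set up earlier in the file.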
+ dcbt AO, PRE
+ LOAD2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble STRMM_L2x16_LOOP_END
+
+ .align 5
+
+STRMM_L2x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt STRMM_L2x16_LOOP
+
+STRMM_L2x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b STRMM_L2x16_SUB1
+
+STRMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b STRMM_L2x16_SUB1
+
+STRMM_L2x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L2x16_SAVE
+ b STRMM_L2x16_SUB2
+
+STRMM_L2x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L2x16_SAVE
+
+STRMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L2x16_SUB2
+
+STRMM_L2x16_SAVE:
+
+ SAVE2x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 3		// TEMP2 = TEMP1 * 8 bytes (2 floats of B per k-step)
+	slwi	T1, T1, 6		// TEMP1 = TEMP1 * 64 bytes (16 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt STRMM_L2x16_BEGIN
+
+STRMM_L2x16_END:
+
+STRMM_L2x8_BEGIN:
+ andi. T2, M, 15
+ ble STRMM_L2x1_END
+
+ andi. T1, M, 8
+ ble STRMM_L2x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L2x8_SUB4
+
+STRMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble STRMM_L2x8_LOOP_END
+
+ .align 5
+
+STRMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt STRMM_L2x8_LOOP
+
+STRMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b STRMM_L2x8_SUB1
+
+STRMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b STRMM_L2x8_SUB1
+
+STRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L2x8_SAVE
+ b STRMM_L2x8_SUB2
+
+STRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L2x8_SAVE
+
+STRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L2x8_SUB2
+
+STRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 3		// TEMP2 = TEMP1 * 8 bytes (2 floats of B per k-step)
+	slwi	T1, T1, 5		// TEMP1 = TEMP1 * 32 bytes (8 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+STRMM_L2x8_END:
+
+STRMM_L2x4_BEGIN:
+
+ andi. T1, M, 4
+ ble STRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L2x4_SUB4
+
+STRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble STRMM_L2x4_LOOP_END
+
+ .align 5
+
+STRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt STRMM_L2x4_LOOP
+
+STRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b STRMM_L2x4_SUB1
+
+STRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b STRMM_L2x4_SUB1
+
+STRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L2x4_SAVE
+ b STRMM_L2x4_SUB2
+
+STRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L2x4_SAVE
+
+STRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L2x4_SUB2
+
+STRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 3		// TEMP2 = TEMP1 * 8 bytes (2 floats of B per k-step)
+	slwi	T1, T1, 4		// TEMP1 = TEMP1 * 16 bytes (4 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+STRMM_L2x4_END:
+
+STRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble STRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L2x2_SUB4
+
+STRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble STRMM_L2x2_LOOP_END
+
+ .align 5
+
+STRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt STRMM_L2x2_LOOP
+
+STRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b STRMM_L2x2_SUB1
+
+STRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b STRMM_L2x2_SUB1
+
+STRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L2x2_SAVE
+ b STRMM_L2x2_SUB2
+
+STRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L2x2_SAVE
+
+STRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L2x2_SUB2
+
+STRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 3		// TEMP2 = TEMP1 * 8 bytes (2 floats of B per k-step)
+	slwi	T1, T1, 3		// TEMP1 = TEMP1 * 8 bytes (2 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+STRMM_L2x2_END:
+
+STRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble STRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L2x1_SUB4
+
+STRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble STRMM_L2x1_LOOP_END
+
+ .align 5
+
+STRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt STRMM_L2x1_LOOP
+
+STRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b STRMM_L2x1_SUB1
+
+STRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b STRMM_L2x1_SUB1
+
+STRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L2x1_SAVE
+ b STRMM_L2x1_SUB2
+
+STRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L2x1_SAVE
+
+STRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L2x1_SUB2
+
+STRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 3		// TEMP2 = TEMP1 * 8 bytes (2 floats of B per k-step)
+	slwi	T1, T1, 2		// TEMP1 = TEMP1 * 4 bytes (1 float of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+STRMM_L2x1_END:
+
+ slwi T1, K, 3
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+STRMM_L2_END:
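+
+// N & 1: process the last remaining column of C with the 1xM kernels below.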
+STRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble STRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble STRMM_L1x16_END
+
+STRMM_L1x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L1x16_SUB4
+
+STRMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble STRMM_L1x16_LOOP_END
+
+ .align 5
+
+STRMM_L1x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt STRMM_L1x16_LOOP
+
+STRMM_L1x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b STRMM_L1x16_SUB1
+
+STRMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b STRMM_L1x16_SUB1
+
+STRMM_L1x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L1x16_SAVE
+ b STRMM_L1x16_SUB2
+
+STRMM_L1x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L1x16_SAVE
+
+STRMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L1x16_SUB2
+
+STRMM_L1x16_SAVE:
+
+ SAVE1x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 2		// TEMP2 = TEMP1 * 4 bytes (1 float of B per k-step)
+	slwi	T1, T1, 6		// TEMP1 = TEMP1 * 64 bytes (16 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt STRMM_L1x16_BEGIN
+
+STRMM_L1x16_END:
+
+STRMM_L1x8_BEGIN:
+ andi. T2, M, 15
+ ble STRMM_L1x1_END
+
+ andi. T1, M, 8
+ ble STRMM_L1x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L1x8_SUB4
+
+STRMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble STRMM_L1x8_LOOP_END
+
+ .align 5
+
+STRMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt STRMM_L1x8_LOOP
+
+STRMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b STRMM_L1x8_SUB1
+
+STRMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b STRMM_L1x8_SUB1
+
+STRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L1x8_SAVE
+ b STRMM_L1x8_SUB2
+
+STRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L1x8_SAVE
+
+STRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L1x8_SUB2
+
+STRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 2		// TEMP2 = TEMP1 * 4 bytes (1 float of B per k-step)
+	slwi	T1, T1, 5		// TEMP1 = TEMP1 * 32 bytes (8 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+STRMM_L1x8_END:
+
+STRMM_L1x4_BEGIN:
+
+ andi. T1, M, 4
+ ble STRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L1x4_SUB4
+
+STRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble STRMM_L1x4_LOOP_END
+
+ .align 5
+
+STRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt STRMM_L1x4_LOOP
+
+STRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b STRMM_L1x4_SUB1
+
+STRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b STRMM_L1x4_SUB1
+
+STRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L1x4_SAVE
+ b STRMM_L1x4_SUB2
+
+STRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L1x4_SAVE
+
+STRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L1x4_SUB2
+
+STRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 2		// TEMP2 = TEMP1 * 4 bytes (1 float of B per k-step)
+	slwi	T1, T1, 4		// TEMP1 = TEMP1 * 16 bytes (4 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+STRMM_L1x4_END:
+
+STRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble STRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L1x2_SUB4
+
+STRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble STRMM_L1x2_LOOP_END
+
+ .align 5
+
+STRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt STRMM_L1x2_LOOP
+
+STRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b STRMM_L1x2_SUB1
+
+STRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b STRMM_L1x2_SUB1
+
+STRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L1x2_SAVE
+ b STRMM_L1x2_SUB2
+
+STRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L1x2_SAVE
+
+STRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L1x2_SUB2
+
+STRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 2		// TEMP2 = TEMP1 * 4 bytes (1 float of B per k-step)
+	slwi	T1, T1, 3		// TEMP1 = TEMP1 * 8 bytes (2 floats of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+STRMM_L1x2_END:
+
+STRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble STRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble STRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble STRMM_L1x1_SUB4
+
+STRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble STRMM_L1x1_LOOP_END
+
+ .align 5
+
+STRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt STRMM_L1x1_LOOP
+
+STRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b STRMM_L1x1_SUB1
+
+STRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b STRMM_L1x1_SUB1
+
+STRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble STRMM_L1x1_SAVE
+ b STRMM_L1x1_SUB2
+
+STRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble STRMM_L1x1_SAVE
+
+STRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt STRMM_L1x1_SUB2
+
+STRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK		// TEMP1 = K - KKK
+	slwi	T2, T1, 2		// TEMP2 = TEMP1 * 4 bytes (1 float of B per k-step)
+	slwi	T1, T1, 2		// TEMP1 = TEMP1 * 4 bytes (1 float of A per k-step)
+	add	BO, BO, T2		// BO += TEMP2 (skip the k-steps this tile did not use)
+	add	AO, AO, T1		// AO += TEMP1 (skip the k-steps this tile did not use)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+STRMM_L1x1_END:
+
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+STRMM_L1_END:
diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S
new file mode 100644
index 000000000..27bc1e89c
--- /dev/null
+++ b/kernel/power/strmm_macros_16x8_power8.S
@@ -0,0 +1,5840 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
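+/**************************************************************************************
+* Naming scheme of the macros below (shown for N=8, M=16; the other block sizes
+* follow the same pattern):
+*   LOAD8x16_1        preload one k-step of A (vs0-vs3) and splat the eight B
+*                     values (vs8-vs15) before entering the unrolled loop
+*   KERNEL8x16_I1     first k-step: load the alternate register set (vs4-vs7 /
+*                     vs16-vs23) and initialise the accumulators vs32-vs63 (xvmulsp)
+*   KERNEL8x16_1/_2   steady-state pair: each consumes one register set while
+*                     reloading the other, accumulating with xvmaddasp
+*   KERNEL8x16_E2     final k-step of the unrolled loop: accumulate only, no loads
+*   KERNEL8x16_SUBI1  single k-step that initialises the accumulators (short trips)
+*   KERNEL8x16_SUB1   single k-step that accumulates (K mod 8 remainder)
+*   SAVE8x16          scale the accumulators by alpha and store the C tile
+*                     (adding to the existing C values unless TRMMKERNEL is set)
+**************************************************************************************/
+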
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
+
+.macro LOAD8x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+ xvmulsp vs48, vs0, vs12
+ xvmulsp vs49, vs1, vs12
+ xvmulsp vs50, vs2, vs12
+ xvmulsp vs51, vs3, vs12
+
+ xvmulsp vs52, vs0, vs13
+ xvmulsp vs53, vs1, vs13
+ xvmulsp vs54, vs2, vs13
+ xvmulsp vs55, vs3, vs13
+
+ xvmulsp vs56, vs0, vs14
+ xvmulsp vs57, vs1, vs14
+ xvmulsp vs58, vs2, vs14
+ xvmulsp vs59, vs3, vs14
+
+ xvmulsp vs60, vs0, vs15
+ xvmulsp vs61, vs1, vs15
+ xvmulsp vs62, vs2, vs15
+ xvmulsp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+ xvmaddasp vs48, vs0, vs12
+ xvmaddasp vs49, vs1, vs12
+ xvmaddasp vs50, vs2, vs12
+ xvmaddasp vs51, vs3, vs12
+
+ xvmaddasp vs52, vs0, vs13
+ xvmaddasp vs53, vs1, vs13
+ xvmaddasp vs54, vs2, vs13
+ xvmaddasp vs55, vs3, vs13
+
+ xvmaddasp vs56, vs0, vs14
+ xvmaddasp vs57, vs1, vs14
+ xvmaddasp vs58, vs2, vs14
+ xvmaddasp vs59, vs3, vs14
+
+ xvmaddasp vs60, vs0, vs15
+ xvmaddasp vs61, vs1, vs15
+ xvmaddasp vs62, vs2, vs15
+ xvmaddasp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+ xvmaddasp vs48, vs4, vs20
+ xvmaddasp vs49, vs5, vs20
+ xvmaddasp vs50, vs6, vs20
+ xvmaddasp vs51, vs7, vs20
+
+ xvmaddasp vs52, vs4, vs21
+ xvmaddasp vs53, vs5, vs21
+ xvmaddasp vs54, vs6, vs21
+ xvmaddasp vs55, vs7, vs21
+
+ xvmaddasp vs56, vs4, vs22
+ xvmaddasp vs57, vs5, vs22
+ xvmaddasp vs58, vs6, vs22
+ xvmaddasp vs59, vs7, vs22
+
+ xvmaddasp vs60, vs4, vs23
+ xvmaddasp vs61, vs5, vs23
+ xvmaddasp vs62, vs6, vs23
+ xvmaddasp vs63, vs7, vs23
+
+
+.endm
+
+.macro KERNEL8x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+ xvmaddasp vs48, vs4, vs20
+ xvmaddasp vs49, vs5, vs20
+ xvmaddasp vs50, vs6, vs20
+ xvmaddasp vs51, vs7, vs20
+
+ xvmaddasp vs52, vs4, vs21
+ xvmaddasp vs53, vs5, vs21
+ xvmaddasp vs54, vs6, vs21
+ xvmaddasp vs55, vs7, vs21
+
+ xvmaddasp vs56, vs4, vs22
+ xvmaddasp vs57, vs5, vs22
+ xvmaddasp vs58, vs6, vs22
+ xvmaddasp vs59, vs7, vs22
+
+ xvmaddasp vs60, vs4, vs23
+ xvmaddasp vs61, vs5, vs23
+ xvmaddasp vs62, vs6, vs23
+ xvmaddasp vs63, vs7, vs23
+
+
+.endm
+
+.macro KERNEL8x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+ xvmulsp vs48, vs0, vs12
+ xvmulsp vs49, vs1, vs12
+ xvmulsp vs50, vs2, vs12
+ xvmulsp vs51, vs3, vs12
+
+ xvmulsp vs52, vs0, vs13
+ xvmulsp vs53, vs1, vs13
+ xvmulsp vs54, vs2, vs13
+ xvmulsp vs55, vs3, vs13
+
+ xvmulsp vs56, vs0, vs14
+ xvmulsp vs57, vs1, vs14
+ xvmulsp vs58, vs2, vs14
+ xvmulsp vs59, vs3, vs14
+
+ xvmulsp vs60, vs0, vs15
+ xvmulsp vs61, vs1, vs15
+ xvmulsp vs62, vs2, vs15
+ xvmulsp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+ xvmaddasp vs48, vs0, vs12
+ xvmaddasp vs49, vs1, vs12
+ xvmaddasp vs50, vs2, vs12
+ xvmaddasp vs51, vs3, vs12
+
+ xvmaddasp vs52, vs0, vs13
+ xvmaddasp vs53, vs1, vs13
+ xvmaddasp vs54, vs2, vs13
+ xvmaddasp vs55, vs3, vs13
+
+ xvmaddasp vs56, vs0, vs14
+ xvmaddasp vs57, vs1, vs14
+ xvmaddasp vs58, vs2, vs14
+ xvmaddasp vs59, vs3, vs14
+
+ xvmaddasp vs60, vs0, vs15
+ xvmaddasp vs61, vs1, vs15
+ xvmaddasp vs62, vs2, vs15
+ xvmaddasp vs63, vs3, vs15
+
+
+.endm
+
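+// SAVE8x16 steps through the eight columns of the C tile (advancing by LDC) and
+// writes four vectors (16 floats) per column; with TRMMKERNEL the result is
+// alpha * AB, otherwise alpha * AB is added to the values already in C.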
+.macro SAVE8x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+ xvmulsp vs2, vs42, alpha_vr
+ xvmulsp vs3, vs43, alpha_vr
+#else
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+ xvmaddasp vs2, vs42, alpha_vr
+ xvmaddasp vs3, vs43, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+ xvmulsp vs2, vs46, alpha_vr
+ xvmulsp vs3, vs47, alpha_vr
+#else
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+ xvmaddasp vs2, vs46, alpha_vr
+ xvmaddasp vs3, vs47, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs48, alpha_vr
+ xvmulsp vs1, vs49, alpha_vr
+ xvmulsp vs2, vs50, alpha_vr
+ xvmulsp vs3, vs51, alpha_vr
+#else
+ xvmaddasp vs0, vs48, alpha_vr
+ xvmaddasp vs1, vs49, alpha_vr
+ xvmaddasp vs2, vs50, alpha_vr
+ xvmaddasp vs3, vs51, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs52, alpha_vr
+ xvmulsp vs1, vs53, alpha_vr
+ xvmulsp vs2, vs54, alpha_vr
+ xvmulsp vs3, vs55, alpha_vr
+#else
+ xvmaddasp vs0, vs52, alpha_vr
+ xvmaddasp vs1, vs53, alpha_vr
+ xvmaddasp vs2, vs54, alpha_vr
+ xvmaddasp vs3, vs55, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs56, alpha_vr
+ xvmulsp vs1, vs57, alpha_vr
+ xvmulsp vs2, vs58, alpha_vr
+ xvmulsp vs3, vs59, alpha_vr
+#else
+ xvmaddasp vs0, vs56, alpha_vr
+ xvmaddasp vs1, vs57, alpha_vr
+ xvmaddasp vs2, vs58, alpha_vr
+ xvmaddasp vs3, vs59, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs60, alpha_vr
+ xvmulsp vs1, vs61, alpha_vr
+ xvmulsp vs2, vs62, alpha_vr
+ xvmulsp vs3, vs63, alpha_vr
+#else
+ xvmaddasp vs0, vs60, alpha_vr
+ xvmaddasp vs1, vs61, alpha_vr
+ xvmaddasp vs2, vs62, alpha_vr
+ xvmaddasp vs3, vs63, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
+
+.macro LOAD8x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+ xvmulsp vs40, vs0, vs12
+ xvmulsp vs41, vs1, vs12
+
+ xvmulsp vs42, vs0, vs13
+ xvmulsp vs43, vs1, vs13
+
+ xvmulsp vs44, vs0, vs14
+ xvmulsp vs45, vs1, vs14
+
+ xvmulsp vs46, vs0, vs15
+ xvmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+ xvmaddasp vs40, vs0, vs12
+ xvmaddasp vs41, vs1, vs12
+
+ xvmaddasp vs42, vs0, vs13
+ xvmaddasp vs43, vs1, vs13
+
+ xvmaddasp vs44, vs0, vs14
+ xvmaddasp vs45, vs1, vs14
+
+ xvmaddasp vs46, vs0, vs15
+ xvmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+ xvmaddasp vs40, vs4, vs20
+ xvmaddasp vs41, vs5, vs20
+
+ xvmaddasp vs42, vs4, vs21
+ xvmaddasp vs43, vs5, vs21
+
+ xvmaddasp vs44, vs4, vs22
+ xvmaddasp vs45, vs5, vs22
+
+ xvmaddasp vs46, vs4, vs23
+ xvmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+ xvmaddasp vs40, vs4, vs20
+ xvmaddasp vs41, vs5, vs20
+
+ xvmaddasp vs42, vs4, vs21
+ xvmaddasp vs43, vs5, vs21
+
+ xvmaddasp vs44, vs4, vs22
+ xvmaddasp vs45, vs5, vs22
+
+ xvmaddasp vs46, vs4, vs23
+ xvmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+ xvmulsp vs40, vs0, vs12
+ xvmulsp vs41, vs1, vs12
+
+ xvmulsp vs42, vs0, vs13
+ xvmulsp vs43, vs1, vs13
+
+ xvmulsp vs44, vs0, vs14
+ xvmulsp vs45, vs1, vs14
+
+ xvmulsp vs46, vs0, vs15
+ xvmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+ xvmaddasp vs40, vs0, vs12
+ xvmaddasp vs41, vs1, vs12
+
+ xvmaddasp vs42, vs0, vs13
+ xvmaddasp vs43, vs1, vs13
+
+ xvmaddasp vs44, vs0, vs14
+ xvmaddasp vs45, vs1, vs14
+
+ xvmaddasp vs46, vs0, vs15
+ xvmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro SAVE8x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs38, alpha_vr
+ xvmulsp vs1, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs38, alpha_vr
+ xvmaddasp vs1, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+#else
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs42, alpha_vr
+ xvmulsp vs1, vs43, alpha_vr
+#else
+ xvmaddasp vs0, vs42, alpha_vr
+ xvmaddasp vs1, vs43, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+#else
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs46, alpha_vr
+ xvmulsp vs1, vs47, alpha_vr
+#else
+ xvmaddasp vs0, vs46, alpha_vr
+ xvmaddasp vs1, vs47, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
+
+.macro LOAD8x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+ xvmulsp vs36, vs0, vs12
+
+ xvmulsp vs37, vs0, vs13
+
+ xvmulsp vs38, vs0, vs14
+
+ xvmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+ xvmaddasp vs36, vs0, vs12
+
+ xvmaddasp vs37, vs0, vs13
+
+ xvmaddasp vs38, vs0, vs14
+
+ xvmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+ xvmaddasp vs36, vs4, vs20
+
+ xvmaddasp vs37, vs4, vs21
+
+ xvmaddasp vs38, vs4, vs22
+
+ xvmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+ xvmaddasp vs36, vs4, vs20
+
+ xvmaddasp vs37, vs4, vs21
+
+ xvmaddasp vs38, vs4, vs22
+
+ xvmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+ xvmulsp vs36, vs0, vs12
+
+ xvmulsp vs37, vs0, vs13
+
+ xvmulsp vs38, vs0, vs14
+
+ xvmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+ xvmaddasp vs36, vs0, vs12
+
+ xvmaddasp vs37, vs0, vs13
+
+ xvmaddasp vs38, vs0, vs14
+
+ xvmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro SAVE8x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs37, alpha_vr
+#else
+ xvmaddasp vs0, vs37, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs38, alpha_vr
+#else
+ xvmaddasp vs0, vs38, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
+
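+// The M=2 and M=1 kernels operate on scalars: lxsspx loads single floats and
+// xsmuldp/xsmaddadp do the multiply/accumulate, so no vector splats of B are
+// needed and SAVE scales with the scalar alpha_r instead of alpha_vr.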
+.macro LOAD8x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+ xsmuldp vs40, vs0, vs12
+ xsmuldp vs41, vs1, vs12
+
+ xsmuldp vs42, vs0, vs13
+ xsmuldp vs43, vs1, vs13
+
+ xsmuldp vs44, vs0, vs14
+ xsmuldp vs45, vs1, vs14
+
+ xsmuldp vs46, vs0, vs15
+ xsmuldp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+ xsmaddadp vs40, vs0, vs12
+ xsmaddadp vs41, vs1, vs12
+
+ xsmaddadp vs42, vs0, vs13
+ xsmaddadp vs43, vs1, vs13
+
+ xsmaddadp vs44, vs0, vs14
+ xsmaddadp vs45, vs1, vs14
+
+ xsmaddadp vs46, vs0, vs15
+ xsmaddadp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+ xsmaddadp vs40, vs4, vs20
+ xsmaddadp vs41, vs5, vs20
+
+ xsmaddadp vs42, vs4, vs21
+ xsmaddadp vs43, vs5, vs21
+
+ xsmaddadp vs44, vs4, vs22
+ xsmaddadp vs45, vs5, vs22
+
+ xsmaddadp vs46, vs4, vs23
+ xsmaddadp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+ xsmaddadp vs40, vs4, vs20
+ xsmaddadp vs41, vs5, vs20
+
+ xsmaddadp vs42, vs4, vs21
+ xsmaddadp vs43, vs5, vs21
+
+ xsmaddadp vs44, vs4, vs22
+ xsmaddadp vs45, vs5, vs22
+
+ xsmaddadp vs46, vs4, vs23
+ xsmaddadp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+ xsmuldp vs40, vs0, vs12
+ xsmuldp vs41, vs1, vs12
+
+ xsmuldp vs42, vs0, vs13
+ xsmuldp vs43, vs1, vs13
+
+ xsmuldp vs44, vs0, vs14
+ xsmuldp vs45, vs1, vs14
+
+ xsmuldp vs46, vs0, vs15
+ xsmuldp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+ xsmaddadp vs40, vs0, vs12
+ xsmaddadp vs41, vs1, vs12
+
+ xsmaddadp vs42, vs0, vs13
+ xsmaddadp vs43, vs1, vs13
+
+ xsmaddadp vs44, vs0, vs14
+ xsmaddadp vs45, vs1, vs14
+
+ xsmaddadp vs46, vs0, vs15
+ xsmaddadp vs47, vs1, vs15
+
+
+.endm
+
+.macro SAVE8x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+ xsmuldp vs1, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+ xsmaddadp vs1, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs36, alpha_r
+ xsmuldp vs1, vs37, alpha_r
+#else
+ xsmaddadp vs0, vs36, alpha_r
+ xsmaddadp vs1, vs37, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs38, alpha_r
+ xsmuldp vs1, vs39, alpha_r
+#else
+ xsmaddadp vs0, vs38, alpha_r
+ xsmaddadp vs1, vs39, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs40, alpha_r
+ xsmuldp vs1, vs41, alpha_r
+#else
+ xsmaddadp vs0, vs40, alpha_r
+ xsmaddadp vs1, vs41, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs42, alpha_r
+ xsmuldp vs1, vs43, alpha_r
+#else
+ xsmaddadp vs0, vs42, alpha_r
+ xsmaddadp vs1, vs43, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs44, alpha_r
+ xsmuldp vs1, vs45, alpha_r
+#else
+ xsmaddadp vs0, vs44, alpha_r
+ xsmaddadp vs1, vs45, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs46, alpha_r
+ xsmuldp vs1, vs47, alpha_r
+#else
+ xsmaddadp vs0, vs46, alpha_r
+ xsmaddadp vs1, vs47, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
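+/* Each N x M block below follows the same pattern: LOAD*_1 preloads A and B into the
+*  primary register set, KERNEL*_I1 starts the accumulators with a multiply while
+*  fetching the alternate set, KERNEL*_1 and KERNEL*_2 are the two halves of the
+*  software-pipelined loop (alternating register sets), KERNEL*_E2 drains the pipeline
+*  without further loads, KERNEL*_SUBI1/_SUB1 handle the left-over iterations, and
+*  SAVE* scales by alpha and stores to C (adding the existing C unless TRMMKERNEL).   */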
+
+.macro LOAD8x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+ xsmuldp vs36, vs0, vs12
+
+ xsmuldp vs37, vs0, vs13
+
+ xsmuldp vs38, vs0, vs14
+
+ xsmuldp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+ xsmaddadp vs36, vs0, vs12
+
+ xsmaddadp vs37, vs0, vs13
+
+ xsmaddadp vs38, vs0, vs14
+
+ xsmaddadp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+ xsmaddadp vs36, vs4, vs20
+
+ xsmaddadp vs37, vs4, vs21
+
+ xsmaddadp vs38, vs4, vs22
+
+ xsmaddadp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+ xsmaddadp vs36, vs4, vs20
+
+ xsmaddadp vs37, vs4, vs21
+
+ xsmaddadp vs38, vs4, vs22
+
+ xsmaddadp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+ xsmuldp vs36, vs0, vs12
+
+ xsmuldp vs37, vs0, vs13
+
+ xsmuldp vs38, vs0, vs14
+
+ xsmuldp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+ xsmaddadp vs36, vs0, vs12
+
+ xsmaddadp vs37, vs0, vs13
+
+ xsmaddadp vs38, vs0, vs14
+
+ xsmaddadp vs39, vs0, vs15
+
+
+.endm
+
+.macro SAVE8x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs36, alpha_r
+#else
+ xsmaddadp vs0, vs36, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs37, alpha_r
+#else
+ xsmaddadp vs0, vs37, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs38, alpha_r
+#else
+ xsmaddadp vs0, vs38, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs39, alpha_r
+#else
+ xsmaddadp vs0, vs39, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+.macro LOAD4x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+
+.endm
+
+.macro SAVE4x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+ xvmulsp vs2, vs42, alpha_vr
+ xvmulsp vs3, vs43, alpha_vr
+#else
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+ xvmaddasp vs2, vs42, alpha_vr
+ xvmaddasp vs3, vs43, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+ xvmulsp vs2, vs46, alpha_vr
+ xvmulsp vs3, vs47, alpha_vr
+#else
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+ xvmaddasp vs2, vs46, alpha_vr
+ xvmaddasp vs3, vs47, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs38, alpha_vr
+ xvmulsp vs1, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs38, alpha_vr
+ xvmaddasp vs1, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro LOAD4x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
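+/* The M=2 and M=1 tiles are too narrow for the 4-word vector loads, so these macros use
+*  scalar lxsspx loads with xsmuldp/xsmaddadp arithmetic and apply alpha_r elementwise
+*  in SAVE*, whereas the wider tiles use lxvw4x with xvmulsp/xvmaddasp and alpha_vr.   */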
+
+.macro LOAD4x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+ xsmaddadp vs36, vs4, vs18
+ xsmaddadp vs37, vs5, vs18
+
+ xsmaddadp vs38, vs4, vs19
+ xsmaddadp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+ xsmuldp vs36, vs0, vs10
+ xsmuldp vs37, vs1, vs10
+
+ xsmuldp vs38, vs0, vs11
+ xsmuldp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+ xsmaddadp vs36, vs0, vs10
+ xsmaddadp vs37, vs1, vs10
+
+ xsmaddadp vs38, vs0, vs11
+ xsmaddadp vs39, vs1, vs11
+
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+ xsmuldp vs1, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+ xsmaddadp vs1, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs36, alpha_r
+ xsmuldp vs1, vs37, alpha_r
+#else
+ xsmaddadp vs0, vs36, alpha_r
+ xsmaddadp vs1, vs37, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs38, alpha_r
+ xsmuldp vs1, vs39, alpha_r
+#else
+ xsmaddadp vs0, vs38, alpha_r
+ xsmaddadp vs1, vs39, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro LOAD4x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+ xsmaddadp vs34, vs4, vs18
+
+ xsmaddadp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+ xsmuldp vs34, vs0, vs10
+
+ xsmuldp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+ xsmaddadp vs34, vs0, vs10
+
+ xsmaddadp vs35, vs0, vs11
+
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
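+/* With N=2 the lxvw4x from BO still fetches a full 16 bytes, but only words 0 and 1 are
+*  splatted (vs8/vs9 resp. vs16/vs17) and BO advances by 8; the N=1 macros further below
+*  use only word 0 and advance BO by 4.                                                */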
+
+.macro LOAD2x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+#else
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+ xsmaddadp vs34, vs4, vs17
+ xsmaddadp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+ xsmuldp vs34, vs0, vs9
+ xsmuldp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+ xsmaddadp vs34, vs0, vs9
+ xsmaddadp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs34, alpha_r
+ xsmuldp vs1, vs35, alpha_r
+#else
+ xsmaddadp vs0, vs34, alpha_r
+ xsmaddadp vs1, vs35, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+ xsmaddadp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs8
+
+ xsmuldp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs8
+
+ xsmaddadp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+.macro LOAD1x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xvmulsp vs0, vs32, alpha_vr
+#else
+ xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+ xsmaddadp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmuldp vs32, vs0, vs8
+ xsmuldp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddadp vs32, vs0, vs8
+ xsmaddadp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+ xsmuldp vs1, vs33, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+ xsmaddadp vs1, vs33, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmuldp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddadp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddadp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmuldp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddadp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+ xsmuldp vs0, vs32, alpha_r
+#else
+ xsmaddadp vs0, vs32, alpha_r
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c
new file mode 100644
index 000000000..abd6ec08a
--- /dev/null
+++ b/kernel/power/zasum.c
@@ -0,0 +1,149 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#if defined(POWER8)
+#include "zasum_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
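+/* Scalar fallback: four partial sums accumulate |re| + |im|, consuming four complex
+   elements (eight FLOATs) per iteration; CNAME below passes n1, a multiple of 8. */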
+static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
+{
+
+ BLASLONG i=0;
+ FLOAT *x = x1;
+ FLOAT temp0, temp1, temp2, temp3;
+ FLOAT temp4, temp5, temp6, temp7;
+ FLOAT sum0 = 0.0;
+ FLOAT sum1 = 0.0;
+ FLOAT sum2 = 0.0;
+ FLOAT sum3 = 0.0;
+
+ while ( i< n )
+ {
+
+ temp0 = ABS(x[0]);
+ temp1 = ABS(x[1]);
+ temp2 = ABS(x[2]);
+ temp3 = ABS(x[3]);
+ temp4 = ABS(x[4]);
+ temp5 = ABS(x[5]);
+ temp6 = ABS(x[6]);
+ temp7 = ABS(x[7]);
+
+ sum0 += temp0;
+ sum1 += temp1;
+ sum2 += temp2;
+ sum3 += temp3;
+
+ sum0 += temp4;
+ sum1 += temp5;
+ sum2 += temp6;
+ sum3 += temp7;
+
+ x+=8;
+ i+=4;
+
+ }
+
+ svec[0] = sum0+sum1+sum2+sum3;
+ svec[1] = 0.0;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ip=0;
+ FLOAT sumf = 0.0;
+	FLOAT svec[2] __attribute__ ((aligned (16)));
+ BLASLONG n1;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ if ( inc_x == 1 )
+ {
+
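+		/* n1: n rounded down to a multiple of 8 complex elements for the unrolled
+		   kernel; the scalar loop below handles the remainder */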
+ n1 = n & -8;
+ if ( n1 > 0 )
+ {
+
+ zasum_kernel_8(n1, x, svec);
+ sumf = svec[0] + svec[1];
+ i=n1;
+ ip=2*n1;
+ }
+
+ while(i < n)
+ {
+ sumf += ABS(x[ip]) + ABS(x[ip+1]);
+ i++;
+ ip+=2;
+ }
+
+ }
+ else
+ {
+ inc_x2 = 2* inc_x;
+
+ while(i < n)
+ {
+ sumf += ABS(x[ip]) + ABS(x[ip+1]);
+ ip+=inc_x2;
+ i++;
+ }
+
+ }
+ return(sumf);
+}
+
+
diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c
new file mode 100644
index 000000000..b9f6c0ac6
--- /dev/null
+++ b/kernel/power/zasum_microk_power8.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
+
+static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "dcbt %2 , %4 \n\t"
+
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2 , %4 \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+
+ "xvabsdp 52, 44 \n\t"
+ "xvabsdp 53, 45 \n\t"
+
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "xvabsdp 54, 46 \n\t"
+ "xvabsdp 55, 47 \n\t"
+
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvadddp 36, 36, 52 \n\t"
+ "xvadddp 37, 37, 53 \n\t"
+ "addic. %0 , %0 , -8 \n\t"
+ "xvadddp 38, 38, 54 \n\t"
+ "xvadddp 39, 39, 55 \n\t"
+
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+ "xvabsdp 52, 44 \n\t"
+ "xvabsdp 53, 45 \n\t"
+ "xvabsdp 54, 46 \n\t"
+ "xvabsdp 55, 47 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "xvadddp 36, 36, 52 \n\t"
+ "xvadddp 37, 37, 53 \n\t"
+ "xvadddp 38, 38, 54 \n\t"
+ "xvadddp 39, 39, 55 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+
+ "stxvd2x 32, 0, %3 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (svec), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c
new file mode 100644
index 000000000..0ee0c1bf9
--- /dev/null
+++ b/kernel/power/zaxpy.c
@@ -0,0 +1,140 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+
+#if defined(POWER8)
+#include "zaxpy_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_4
+
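+// Generic C fallback: updates two complex elements of y per pass with
+// y += alpha * x (the sign pattern of the imaginary terms changes when CONJ is defined).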
+static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG register i = 0;
+ BLASLONG register ix = 0;
+ FLOAT da_r = alpha[0];
+ FLOAT da_i = alpha[1];
+
+
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+ y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
+ y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
+#else
+ y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+ y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
+ y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
+#endif
+
+ ix+=4 ;
+ i+=2 ;
+
+ }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT da[4];
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ BLASLONG n1 = n & -16;
+
+ if ( n1 )
+ {
+ da[0] = da_r;
+ da[1] = da_r;
+ da[2] = da_i;
+ da[3] = da_i;
+ zaxpy_kernel_4(n1, x, y , da );
+ ix = 2 * n1;
+ }
+ i = n1;
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ i++ ;
+ ix += 2;
+
+ }
+ return(0);
+
+
+ }
+
+ inc_x *=2;
+ inc_y *=2;
+
+ while(i < n)
+ {
+
+#if !defined(CONJ)
+ y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c
new file mode 100644
index 000000000..c8a529fd9
--- /dev/null
+++ b/kernel/power/zaxpy_microk_power8.c
@@ -0,0 +1,250 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+#define HAVE_KERNEL_4 1
+static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *y2=y+1;
+ BLASLONG pre = 384;
+
+#if !defined(CONJ)
+ FLOAT mvec[2] = { -1.0, 1.0 };
+#else
+ FLOAT mvec[2] = { 1.0, -1.0 };
+#endif
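+
+	// mvec holds the +/- sign pattern applied to the splatted alpha component,
+	// so the two fused multiply-adds in the loop form the full complex product.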
+
+
+ __asm__ __volatile__
+ (
+
+ "lxsdx 34, 0 , %4 \n\t" // alpha_r
+ "lxsdx 35, %5, %4 \n\t" // alpha_i
+ "xxspltd 32, 34, 0 \n\t"
+ "xxspltd 33, 35, 0 \n\t"
+
+ "lxvd2x 36, 0, %9 \n\t" // mvec
+
+#if !defined(CONJ)
+ "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec
+#else
+ "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec
+#endif
+
+ "addi %8, %8, -8 \n\t"
+
+ "dcbt %2, %10 \n\t"
+ "dcbt %3, %10 \n\t"
+
+
+ "lxvd2x 40, 0, %2 \n\t" // x0
+ "lxvd2x 41, %5, %2 \n\t" // x1
+ "lxvd2x 42, %6, %2 \n\t" // x2
+ "lxvd2x 43, %7, %2 \n\t" // x3
+
+ "lxvd2x 48, 0, %3 \n\t" // y0
+ "lxvd2x 49, %5, %3 \n\t" // y1
+ "lxvd2x 50, %6, %3 \n\t" // y2
+ "lxvd2x 51, %7, %3 \n\t" // y3
+
+ "xxswapd 56, 40 \n\t" // exchange real and imag part
+ "xxswapd 57, 41 \n\t" // exchange real and imag part
+ "xxswapd 58, 42 \n\t" // exchange real and imag part
+ "xxswapd 59, 43 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t" // x4
+ "lxvd2x 45, %5, %2 \n\t" // x5
+ "lxvd2x 46, %6, %2 \n\t" // x6
+ "lxvd2x 47, %7, %2 \n\t" // x7
+
+ "lxvd2x 52, 0, %3 \n\t" // y4
+ "lxvd2x 53, %5, %3 \n\t" // y5
+ "lxvd2x 54, %6, %3 \n\t" // y6
+ "lxvd2x 55, %7, %3 \n\t" // y7
+
+ "xxswapd 60, 44 \n\t" // exchange real and imag part
+ "xxswapd 61, 45 \n\t" // exchange real and imag part
+ "xxswapd 62, 46 \n\t" // exchange real and imag part
+ "xxswapd 63, 47 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %10 \n\t"
+ "dcbt %3, %10 \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddadp 49, 41, 32 \n\t"
+ "lxvd2x 40, 0, %2 \n\t" // x0
+ "lxvd2x 41, %5, %2 \n\t" // x1
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+ "lxvd2x 42, %6, %2 \n\t" // x2
+ "lxvd2x 43, %7, %2 \n\t" // x3
+
+ "xvmaddadp 52, 44, 32 \n\t"
+ "addi %2, %2, 64 \n\t"
+ "xvmaddadp 53, 45, 32 \n\t"
+ "lxvd2x 44, 0, %2 \n\t" // x4
+ "lxvd2x 45, %5, %2 \n\t" // x5
+ "xvmaddadp 54, 46, 32 \n\t"
+ "xvmaddadp 55, 47, 32 \n\t"
+ "lxvd2x 46, %6, %2 \n\t" // x6
+ "lxvd2x 47, %7, %2 \n\t" // x7
+
+ "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "addi %2, %2, 64 \n\t"
+ "xvmaddadp 49, 57, 33 \n\t"
+ "xvmaddadp 50, 58, 33 \n\t"
+ "xvmaddadp 51, 59, 33 \n\t"
+
+ "xvmaddadp 52, 60, 33 \n\t"
+ "xvmaddadp 53, 61, 33 \n\t"
+ "xvmaddadp 54, 62, 33 \n\t"
+ "xvmaddadp 55, 63, 33 \n\t"
+
+ "stxvd2x 48, 0, %8 \n\t"
+ "stxvd2x 49, %5, %8 \n\t"
+ "stxvd2x 50, %6, %8 \n\t"
+ "stxvd2x 51, %7, %8 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+
+ "stxvd2x 52, 0, %8 \n\t"
+ "stxvd2x 53, %5, %8 \n\t"
+ "stxvd2x 54, %6, %8 \n\t"
+ "stxvd2x 55, %7, %8 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+
+ "xxswapd 56, 40 \n\t" // exchange real and imag part
+ "xxswapd 57, 41 \n\t" // exchange real and imag part
+ "lxvd2x 48, 0, %3 \n\t" // y0
+ "lxvd2x 49, %5, %3 \n\t" // y1
+ "xxswapd 58, 42 \n\t" // exchange real and imag part
+ "xxswapd 59, 43 \n\t" // exchange real and imag part
+ "lxvd2x 50, %6, %3 \n\t" // y2
+ "lxvd2x 51, %7, %3 \n\t" // y3
+
+ "xxswapd 60, 44 \n\t" // exchange real and imag part
+ "addi %3, %3, 64 \n\t"
+ "xxswapd 61, 45 \n\t" // exchange real and imag part
+ "lxvd2x 52, 0, %3 \n\t" // y4
+ "lxvd2x 53, %5, %3 \n\t" // y5
+ "xxswapd 62, 46 \n\t" // exchange real and imag part
+ "xxswapd 63, 47 \n\t" // exchange real and imag part
+ "lxvd2x 54, %6, %3 \n\t" // y6
+ "lxvd2x 55, %7, %3 \n\t" // y7
+
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddadp 49, 41, 32 \n\t"
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+
+ "xvmaddadp 52, 44, 32 \n\t"
+ "xvmaddadp 53, 45, 32 \n\t"
+ "xvmaddadp 54, 46, 32 \n\t"
+ "xvmaddadp 55, 47, 32 \n\t"
+
+ "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "xvmaddadp 49, 57, 33 \n\t"
+ "xvmaddadp 50, 58, 33 \n\t"
+ "xvmaddadp 51, 59, 33 \n\t"
+
+ "xvmaddadp 52, 60, 33 \n\t"
+ "xvmaddadp 53, 61, 33 \n\t"
+ "xvmaddadp 54, 62, 33 \n\t"
+ "xvmaddadp 55, 63, 33 \n\t"
+
+
+ "stxvd2x 48, 0, %8 \n\t"
+ "stxvd2x 49, %5, %8 \n\t"
+ "stxvd2x 50, %6, %8 \n\t"
+ "stxvd2x 51, %7, %8 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+
+ "stxvd2x 52, 0, %8 \n\t"
+ "stxvd2x 53, %5, %8 \n\t"
+ "stxvd2x 54, %6, %8 \n\t"
+ "stxvd2x 55, %7, %8 \n\t"
+
+ "addi %8, %8, 64 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (y1), // 3
+ "r" (alpha), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (y2), // 8
+ "r" (mvec), // 9
+ "r" (pre) // 10
+ : "cr0", "%0", "%2" , "%3", "%8", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c
new file mode 100644
index 000000000..a7658f7ab
--- /dev/null
+++ b/kernel/power/zcopy.c
@@ -0,0 +1,140 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "zcopy_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_16
+
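+// Generic C fallback: copies four complex elements (eight FLOAT values) of x to y per pass.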
+static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+	while ( i < n )
+	{
+
+		f0 = x1[0];
+		f1 = x1[1];
+		f2 = x1[2];
+		f3 = x1[3];
+		f4 = x1[4];
+		f5 = x1[5];
+		f6 = x1[6];
+		f7 = x1[7];
+
+		y1[0] = f0;
+		y1[1] = f1;
+		y1[2] = f2;
+		y1[3] = f3;
+		y1[4] = f4;
+		y1[5] = f5;
+		y1[6] = f6;
+		y1[7] = f7;
+
+		x1 += 8;
+		y1 += 8;
+		i  += 4;
+
+	}
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		BLASLONG n1 = n & -16;
+		if ( n1 > 0 )
+		{
+			zcopy_kernel_16(n1, x, y);
+			i=n1;
+			ix=n1*2;
+			iy=n1*2;
+		}
+
+ while(i < n)
+ {
+			y[iy]   = x[ix] ;
+			y[iy+1] = x[ix+1] ;
+ ix+=2;
+ iy+=2;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ BLASLONG inc_x2 = 2 * inc_x;
+ BLASLONG inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+ y[iy] = x[ix] ;
+ y[iy+1] = x[ix+1] ;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c
new file mode 100644
index 000000000..73abe084e
--- /dev/null
+++ b/kernel/power/zcopy_microk_power8.c
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
+
+static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
+ __asm__ __volatile__
+ (
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 50, 0, %2 \n\t"
+ "lxvd2x 51, %5, %2 \n\t"
+ "lxvd2x 52, %6, %2 \n\t"
+ "lxvd2x 53, %7, %2 \n\t"
+ "lxvd2x 54, %8, %2 \n\t"
+ "lxvd2x 55, %9, %2 \n\t"
+ "lxvd2x 56, %10, %2 \n\t"
+ "lxvd2x 57, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "stxvd2x 40, 0, %1 \n\t"
+ "stxvd2x 41, %5, %1 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %1 \n\t"
+ "stxvd2x 43, %7, %1 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %1 \n\t"
+ "stxvd2x 45, %9, %1 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %1 \n\t"
+ "stxvd2x 47, %11, %1 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvd2x 50, 0, %1 \n\t"
+ "stxvd2x 51, %5, %1 \n\t"
+ "lxvd2x 50, 0, %2 \n\t"
+ "lxvd2x 51, %5, %2 \n\t"
+ "stxvd2x 52, %6, %1 \n\t"
+ "stxvd2x 53, %7, %1 \n\t"
+ "lxvd2x 52, %6, %2 \n\t"
+ "lxvd2x 53, %7, %2 \n\t"
+ "stxvd2x 54, %8, %1 \n\t"
+ "stxvd2x 55, %9, %1 \n\t"
+ "lxvd2x 54, %8, %2 \n\t"
+ "lxvd2x 55, %9, %2 \n\t"
+ "stxvd2x 56, %10, %1 \n\t"
+ "stxvd2x 57, %11, %1 \n\t"
+ "lxvd2x 56, %10, %2 \n\t"
+ "lxvd2x 57, %11, %2 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "stxvd2x 40, 0, %1 \n\t"
+ "stxvd2x 41, %5, %1 \n\t"
+ "stxvd2x 42, %6, %1 \n\t"
+ "stxvd2x 43, %7, %1 \n\t"
+ "stxvd2x 44, %8, %1 \n\t"
+ "stxvd2x 45, %9, %1 \n\t"
+ "stxvd2x 46, %10, %1 \n\t"
+ "stxvd2x 47, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvd2x 50, 0, %1 \n\t"
+ "stxvd2x 51, %5, %1 \n\t"
+ "stxvd2x 52, %6, %1 \n\t"
+ "stxvd2x 53, %7, %1 \n\t"
+ "stxvd2x 54, %8, %1 \n\t"
+ "stxvd2x 55, %9, %1 \n\t"
+ "stxvd2x 56, %10, %1 \n\t"
+ "stxvd2x 57, %11, %1 \n\t"
+
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c
new file mode 100644
index 000000000..1205b34b6
--- /dev/null
+++ b/kernel/power/zdot.c
@@ -0,0 +1,167 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <complex.h>
+
+
+#if defined(POWER8)
+#include "zdot_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
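+// Generic C fallback: accumulates the four partial products re*re, im*im, re*im and
+// im*re so that either the plain or the conjugated dot product can be assembled later.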
+static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));
+
+static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+ BLASLONG register i = 0;
+ FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 };
+ BLASLONG j=0;
+
+ while( i < n )
+ {
+
+ dot[0] += x[j] * y[j] ;
+ dot[1] += x[j+1] * y[j+1] ;
+ dot[2] += x[j] * y[j+1] ;
+ dot[3] += x[j+1] * y[j] ;
+
+ dot[0] += x[j+2] * y[j+2] ;
+ dot[1] += x[j+3] * y[j+3] ;
+ dot[2] += x[j+2] * y[j+3] ;
+ dot[3] += x[j+3] * y[j+2] ;
+
+ dot[0] += x[j+4] * y[j+4] ;
+ dot[1] += x[j+5] * y[j+5] ;
+ dot[2] += x[j+4] * y[j+5] ;
+ dot[3] += x[j+5] * y[j+4] ;
+
+ dot[0] += x[j+6] * y[j+6] ;
+ dot[1] += x[j+7] * y[j+7] ;
+ dot[2] += x[j+6] * y[j+7] ;
+ dot[3] += x[j+7] * y[j+6] ;
+
+ j+=8;
+ i+=4;
+
+ }
+ d[0] = dot[0];
+ d[1] = dot[1];
+ d[2] = dot[2];
+ d[3] = dot[3];
+
+}
+
+#endif
+
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ FLOAT _Complex result;
+ FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
+
+ if ( n <= 0 )
+ {
+ __real__ result = 0.0 ;
+ __imag__ result = 0.0 ;
+ return(result);
+
+ }
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ BLASLONG n1 = n & -8;
+
+ if ( n1 )
+ zdot_kernel_8(n1, x, y , dot );
+
+ i = n1;
+ BLASLONG j = i * 2;
+
+ while( i < n )
+ {
+
+ dot[0] += x[j] * y[j] ;
+ dot[1] += x[j+1] * y[j+1] ;
+ dot[2] += x[j] * y[j+1] ;
+ dot[3] += x[j+1] * y[j] ;
+
+ j+=2;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+ i=0;
+ ix=0;
+ iy=0;
+ inc_x <<= 1;
+ inc_y <<= 1;
+ while(i < n)
+ {
+
+ dot[0] += x[ix] * y[iy] ;
+ dot[1] += x[ix+1] * y[iy+1] ;
+ dot[2] += x[ix] * y[iy+1] ;
+ dot[3] += x[ix+1] * y[iy] ;
+
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ }
+
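+	// Assemble the final result: without CONJ this yields sum(x*y);
+	// with CONJ the sign changes give sum(conj(x)*y).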
+#if !defined(CONJ)
+ __real__ result = dot[0] - dot[1];
+ __imag__ result = dot[2] + dot[3];
+#else
+ __real__ result = dot[0] + dot[1];
+ __imag__ result = dot[2] - dot[3];
+
+#endif
+
+ return(result);
+
+}
+
+
diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c
new file mode 100644
index 000000000..296d3d469
--- /dev/null
+++ b/kernel/power/zdot_microk_power8.c
@@ -0,0 +1,219 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+ "xxlxor 32,32,32 \n\t"
+ "xxlxor 33,33,33 \n\t"
+ "xxlxor 34,34,34 \n\t"
+ "xxlxor 35,35,35 \n\t"
+ "xxlxor 36,36,36 \n\t"
+ "xxlxor 37,37,37 \n\t"
+ "xxlxor 38,38,38 \n\t"
+ "xxlxor 39,39,39 \n\t"
+
+ "dcbt %2, %8 \n\t"
+ "dcbt %3, %8 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
+ "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
+ "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
+ "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
+ "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
+ "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
+ "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
+
+ "xxswapd 52,48 \n\t" // y0_i, y0_r
+ "xxswapd 53,49 \n\t" // y1_i, y1_r
+ "xxswapd 54,50 \n\t" // y2_i, y2_r
+ "xxswapd 55,51 \n\t" // y3_i, y3_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+
+ "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
+ "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
+ "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
+ "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
+ "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
+ "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
+ "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
+
+ "xxswapd 60,56 \n\t" // y0_i, y0_r
+ "xxswapd 61,57 \n\t" // y1_i, y1_r
+ "xxswapd 62,58 \n\t" // y2_i, y2_r
+ "xxswapd 63,59 \n\t" // y3_i, y3_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %8 \n\t"
+ "dcbt %3, %8 \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
+ "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
+
+ "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
+ "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
+ "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
+
+ "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
+
+ "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
+ "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
+ "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
+
+ "xxswapd 52,48 \n\t" // y0_i, y0_r
+ "xxswapd 53,49 \n\t" // y1_i, y1_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "xxswapd 54,50 \n\t" // y2_i, y2_r
+ "xxswapd 55,51 \n\t" // y3_i, y3_r
+
+ "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
+ "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
+ "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
+ "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
+ "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
+
+ "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
+ "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
+ "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
+ "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
+ "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
+
+ "xxswapd 60,56 \n\t" // y0_i, y0_r
+ "xxswapd 61,57 \n\t" // y1_i, y1_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "xxswapd 62,58 \n\t" // y2_i, y2_r
+ "xxswapd 63,59 \n\t" // y3_i, y3_r
+
+ "addic. %0 , %0 , -8 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
+
+ "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
+
+ "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
+
+ "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
+
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 33, 33, 35 \n\t"
+ "xvadddp 37, 37, 39 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+ "xvadddp 33, 33, 37 \n\t"
+
+ "stxvd2x 32, 0, %4 \n\t"
+ "stxvd2x 33, %5, %4 \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x1), // 2
+ "r" (y1), // 3
+ "r" (dot), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (pre) // 8
+ : "cr0", "%0", "%2" , "%3", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
index a7665f749..336b13b1f 100644
--- a/kernel/power/zgemm_kernel_8x2_power8.S
+++ b/kernel/power/zgemm_kernel_8x2_power8.S
@@ -1,38 +1,3 @@
-/***************************************************************************
-Copyright (c) 2013-2016, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-/**************************************************************************************
-* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-* LAPACK-TEST : OK
-**************************************************************************************/
-
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
-#define STACKSIZE 320
+#define STACKSIZE 32000
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
@@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_r vs30
#define alpha_i vs31
+
+#define FRAMEPOINTER r12
+
+#define BBUFFER r14
+
#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
-#define KK r20
+#define BBO r20
#define o8 r21
#define I r22
#define J r23
@@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE
- addi SP, SP, -STACKSIZE
- li r0, 0
+ mr FRAMEPOINTER, SP
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ addi SP, SP, -STACKSIZE
+ li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
@@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
+ std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef linux
#ifdef __64BIT__
- ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
- ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
- lwz B, FRAMESLOT(0) + STACKSIZE(SP)
- lwz C, FRAMESLOT(1) + STACKSIZE(SP)
- lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
+ lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
+ lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
+ lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else
- lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+ lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
- ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
- ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
- lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
+ lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else
- lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+ lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0
- ble .L999
+ ble L999
cmpwi cr0, N, 0
- ble .L999
+ ble L999
cmpwi cr0, K, 0
- ble .L999
+ ble L999
slwi LDC, LDC, ZBASE_SHIFT
- li PRE, 256
+ li PRE, 384
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48
+ addi BBUFFER, SP, 512+4096
+ li T1, -4096
+ and BBUFFER, BBUFFER, T1
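+	// BBUFFER: 4096-byte aligned scratch area carved out of the enlarged stack
+	// frame, used below to hold a vector-expanded copy of B.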
+
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
- lxvdsx alpha_r, 0, ALPHA
- lxvdsx alpha_i, o8, ALPHA
+ lxsdx alpha_r, 0, ALPHA
+ lxsdx alpha_i, o8, ALPHA
- .align 5
+ .align 4
#include "zgemm_logic_8x2_power8.S"
-.L999:
+L999:
addi r3, 0, 0
lfd f14, 0(SP)
@@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
+ ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
+ addi SP, SP, STACKSIZE
blr
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
index 5fcade5bf..96612da82 100644
--- a/kernel/power/zgemm_logic_8x2_power8.S
+++ b/kernel/power/zgemm_logic_8x2_power8.S
@@ -1,83 +1,111 @@
srawi. J, N, 1
- ble .LZGEMM_L2_END
+ ble ZGEMM_L2_END
+
+ZGEMM_L2_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 1
+
+ZGEMM_L2_COPYB:
+
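+	// Expand B: splat each real and imaginary scalar of B into a full vector and
+	// store it to BBUFFER, so the compute loops can fetch B with plain lxvd2x loads.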
+ lxvdsx vs4, o0, BO // b0_r
+ lxvdsx vs5, o8, BO // b0_i
+ addi BO, BO, 16
+ stxvd2x vs4, o0, BBO
+ stxvd2x vs5, o16, BBO
+ addic. T1, T1, -1
+ addi BBO, BBO, 32
+
+ bge ZGEMM_L2_COPYB
-.LZGEMM_L2_BEGIN:
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 3
- ble .LZGEMM_L2x8_END
+ ble ZGEMM_L2x8_END
-.LZGEMM_L2x8_BEGIN:
+ZGEMM_L2x8_BEGIN:
- mr BO, B
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L2x8_SUB0
+ ble ZGEMM_L2x8_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L2x8_SUB4
+ ble ZGEMM_L2x8_SUB4
-.LZGEMM_L2x8_LOOP_START:
+ZGEMM_L2x8_LOOP_START:
dcbt AO, PRE
+ dcbt BO, PRE
LOAD2x8_1
dcbt AO, PRE
KERNEL2x8_I1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
addic. L, L, -2
- ble .LZGEMM_L2x8_LOOP_END
+ ble ZGEMM_L2x8_LOOP_END
.align 5
-.LZGEMM_L2x8_LOOP:
+ZGEMM_L2x8_LOOP:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
addic. L, L, -1
- bgt .LZGEMM_L2x8_LOOP
+ bgt ZGEMM_L2x8_LOOP
-.LZGEMM_L2x8_LOOP_END:
+ZGEMM_L2x8_LOOP_END:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
@@ -88,9 +116,9 @@
KERNEL2x8_1
KERNEL2x8_E2
- b .LZGEMM_L2x8_SUB1
+ b ZGEMM_L2x8_SUB1
-.LZGEMM_L2x8_SUB4:
+ZGEMM_L2x8_SUB4:
dcbt AO, PRE
KERNEL2x8_SUBI1
@@ -106,53 +134,53 @@
KERNEL2x8_SUB1
KERNEL2x8_SUB1
- b .LZGEMM_L2x8_SUB1
+ b ZGEMM_L2x8_SUB1
-.LZGEMM_L2x8_SUB0:
+ZGEMM_L2x8_SUB0:
andi. L, K, 7
KERNEL2x8_SUBI1
addic. L, L, -1
- ble .LZGEMM_L2x8_SAVE
- b .LZGEMM_L2x8_SUB2
+ ble ZGEMM_L2x8_SAVE
+ b ZGEMM_L2x8_SUB2
-.LZGEMM_L2x8_SUB1:
+ZGEMM_L2x8_SUB1:
andi. L, K, 7
- ble .LZGEMM_L2x8_SAVE
+ ble ZGEMM_L2x8_SAVE
-.LZGEMM_L2x8_SUB2:
+ZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1
addic. L, L, -1
- bgt .LZGEMM_L2x8_SUB2
+ bgt ZGEMM_L2x8_SUB2
-.LZGEMM_L2x8_SAVE:
+ZGEMM_L2x8_SAVE:
SAVE2x8
addic. I, I, -1
- bgt .LZGEMM_L2x8_BEGIN
+ bgt ZGEMM_L2x8_BEGIN
-.LZGEMM_L2x8_END:
+ZGEMM_L2x8_END:
-.LZGEMM_L2x4_BEGIN:
+ZGEMM_L2x4_BEGIN:
andi. T2, M, 7
- ble .LZGEMM_L2x1_END
+ ble ZGEMM_L2x1_END
andi. T1, M, 4
- ble .LZGEMM_L2x4_END
- mr BO, B
+ ble ZGEMM_L2x4_END
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L2x4_SUB0
+ ble ZGEMM_L2x4_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L2x4_SUB4
+ ble ZGEMM_L2x4_SUB4
-.LZGEMM_L2x4_LOOP_START:
+ZGEMM_L2x4_LOOP_START:
LOAD2x4_1
KERNEL2x4_I1
@@ -166,11 +194,11 @@
KERNEL2x4_2
addic. L, L, -2
- ble .LZGEMM_L2x4_LOOP_END
+ ble ZGEMM_L2x4_LOOP_END
.align 5
-.LZGEMM_L2x4_LOOP:
+ZGEMM_L2x4_LOOP:
KERNEL2x4_1
KERNEL2x4_2
@@ -183,9 +211,9 @@
KERNEL2x4_2
addic. L, L, -1
- bgt .LZGEMM_L2x4_LOOP
+ bgt ZGEMM_L2x4_LOOP
-.LZGEMM_L2x4_LOOP_END:
+ZGEMM_L2x4_LOOP_END:
KERNEL2x4_1
KERNEL2x4_2
@@ -197,9 +225,9 @@
KERNEL2x4_1
KERNEL2x4_E2
- b .LZGEMM_L2x4_SUB1
+ b ZGEMM_L2x4_SUB1
-.LZGEMM_L2x4_SUB4:
+ZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1
KERNEL2x4_SUB1
@@ -211,48 +239,48 @@
KERNEL2x4_SUB1
KERNEL2x4_SUB1
- b .LZGEMM_L2x4_SUB1
+ b ZGEMM_L2x4_SUB1
-.LZGEMM_L2x4_SUB0:
+ZGEMM_L2x4_SUB0:
andi. L, K, 7
KERNEL2x4_SUBI1
addic. L, L, -1
- ble .LZGEMM_L2x4_SAVE
- b .LZGEMM_L2x4_SUB2
+ ble ZGEMM_L2x4_SAVE
+ b ZGEMM_L2x4_SUB2
-.LZGEMM_L2x4_SUB1:
+ZGEMM_L2x4_SUB1:
andi. L, K, 7
- ble .LZGEMM_L2x4_SAVE
+ ble ZGEMM_L2x4_SAVE
-.LZGEMM_L2x4_SUB2:
+ZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1
addic. L, L, -1
- bgt .LZGEMM_L2x4_SUB2
+ bgt ZGEMM_L2x4_SUB2
-.LZGEMM_L2x4_SAVE:
+ZGEMM_L2x4_SAVE:
SAVE2x4
-.LZGEMM_L2x4_END:
+ZGEMM_L2x4_END:
-.LZGEMM_L2x2_BEGIN:
+ZGEMM_L2x2_BEGIN:
andi. T1, M, 2
- ble .LZGEMM_L2x2_END
- mr BO, B
+ ble ZGEMM_L2x2_END
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L2x2_SUB0
+ ble ZGEMM_L2x2_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L2x2_SUB4
+ ble ZGEMM_L2x2_SUB4
-.LZGEMM_L2x2_LOOP_START:
+ZGEMM_L2x2_LOOP_START:
LOAD2x2_1
KERNEL2x2_I1
@@ -266,11 +294,11 @@
KERNEL2x2_2
addic. L, L, -2
- ble .LZGEMM_L2x2_LOOP_END
+ ble ZGEMM_L2x2_LOOP_END
.align 5
-.LZGEMM_L2x2_LOOP:
+ZGEMM_L2x2_LOOP:
KERNEL2x2_1
KERNEL2x2_2
@@ -283,9 +311,9 @@
KERNEL2x2_2
addic. L, L, -1
- bgt .LZGEMM_L2x2_LOOP
+ bgt ZGEMM_L2x2_LOOP
-.LZGEMM_L2x2_LOOP_END:
+ZGEMM_L2x2_LOOP_END:
KERNEL2x2_1
KERNEL2x2_2
@@ -297,9 +325,9 @@
KERNEL2x2_1
KERNEL2x2_E2
- b .LZGEMM_L2x2_SUB1
+ b ZGEMM_L2x2_SUB1
-.LZGEMM_L2x2_SUB4:
+ZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1
KERNEL2x2_SUB1
@@ -311,48 +339,48 @@
KERNEL2x2_SUB1
KERNEL2x2_SUB1
- b .LZGEMM_L2x2_SUB1
+ b ZGEMM_L2x2_SUB1
-.LZGEMM_L2x2_SUB0:
+ZGEMM_L2x2_SUB0:
andi. L, K, 7
KERNEL2x2_SUBI1
addic. L, L, -1
- ble .LZGEMM_L2x2_SAVE
- b .LZGEMM_L2x2_SUB2
+ ble ZGEMM_L2x2_SAVE
+ b ZGEMM_L2x2_SUB2
-.LZGEMM_L2x2_SUB1:
+ZGEMM_L2x2_SUB1:
andi. L, K, 7
- ble .LZGEMM_L2x2_SAVE
+ ble ZGEMM_L2x2_SAVE
-.LZGEMM_L2x2_SUB2:
+ZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1
addic. L, L, -1
- bgt .LZGEMM_L2x2_SUB2
+ bgt ZGEMM_L2x2_SUB2
-.LZGEMM_L2x2_SAVE:
+ZGEMM_L2x2_SAVE:
SAVE2x2
-.LZGEMM_L2x2_END:
+ZGEMM_L2x2_END:
-.LZGEMM_L2x1_BEGIN:
+ZGEMM_L2x1_BEGIN:
andi. T1, M, 1
- ble .LZGEMM_L2x1_END
- mr BO, B
+ ble ZGEMM_L2x1_END
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L2x1_SUB0
+ ble ZGEMM_L2x1_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L2x1_SUB4
+ ble ZGEMM_L2x1_SUB4
-.LZGEMM_L2x1_LOOP_START:
+ZGEMM_L2x1_LOOP_START:
LOAD2x1_1
KERNEL2x1_I1
@@ -366,11 +394,11 @@
KERNEL2x1_2
addic. L, L, -2
- ble .LZGEMM_L2x1_LOOP_END
+ ble ZGEMM_L2x1_LOOP_END
.align 5
-.LZGEMM_L2x1_LOOP:
+ZGEMM_L2x1_LOOP:
KERNEL2x1_1
KERNEL2x1_2
@@ -383,9 +411,9 @@
KERNEL2x1_2
addic. L, L, -1
- bgt .LZGEMM_L2x1_LOOP
+ bgt ZGEMM_L2x1_LOOP
-.LZGEMM_L2x1_LOOP_END:
+ZGEMM_L2x1_LOOP_END:
KERNEL2x1_1
KERNEL2x1_2
@@ -397,9 +425,9 @@
KERNEL2x1_1
KERNEL2x1_E2
- b .LZGEMM_L2x1_SUB1
+ b ZGEMM_L2x1_SUB1
-.LZGEMM_L2x1_SUB4:
+ZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1
KERNEL2x1_SUB1
@@ -411,72 +439,89 @@
KERNEL2x1_SUB1
KERNEL2x1_SUB1
- b .LZGEMM_L2x1_SUB1
+ b ZGEMM_L2x1_SUB1
-.LZGEMM_L2x1_SUB0:
+ZGEMM_L2x1_SUB0:
andi. L, K, 7
KERNEL2x1_SUBI1
addic. L, L, -1
- ble .LZGEMM_L2x1_SAVE
- b .LZGEMM_L2x1_SUB2
+ ble ZGEMM_L2x1_SAVE
+ b ZGEMM_L2x1_SUB2
-.LZGEMM_L2x1_SUB1:
+ZGEMM_L2x1_SUB1:
andi. L, K, 7
- ble .LZGEMM_L2x1_SAVE
+ ble ZGEMM_L2x1_SAVE
-.LZGEMM_L2x1_SUB2:
+ZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1
addic. L, L, -1
- bgt .LZGEMM_L2x1_SUB2
+ bgt ZGEMM_L2x1_SUB2
-.LZGEMM_L2x1_SAVE:
+ZGEMM_L2x1_SAVE:
SAVE2x1
-.LZGEMM_L2x1_END:
+ZGEMM_L2x1_END:
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
- bgt .LZGEMM_L2_BEGIN
+ bgt ZGEMM_L2_BEGIN
andi. T2, N, 1
- ble .L999
+ ble L999
-.LZGEMM_L2_END:
+ZGEMM_L2_END:
- b .LZGEMM_L1_BEGIN
+ b ZGEMM_L1_BEGIN
-.L999_H1:
+L999_H1:
- b .L999
+ b L999
+
+ZGEMM_L1_BEGIN:
+
+ mr BO, B
+ mr BBO, BBUFFER
+ slwi T1, K, 0
+
+ZGEMM_L1_COPYB:
+
+ lxvdsx vs4, o0, BO // b0_r
+ lxvdsx vs5, o8, BO // b0_i
+ addi BO, BO, 16
+ stxvd2x vs4, o0, BBO
+ stxvd2x vs5, o16, BBO
+ addic. T1, T1, -1
+ addi BBO, BBO, 32
+
+ bge ZGEMM_L1_COPYB
-.LZGEMM_L1_BEGIN:
andi. T1, N, 1
- ble .LZGEMM_L1_END
+ ble ZGEMM_L1_END
mr CO, C
mr AO, A
srawi. I, M, 3
- ble .LZGEMM_L1x8_END
+ ble ZGEMM_L1x8_END
-.LZGEMM_L1x8_BEGIN:
+ZGEMM_L1x8_BEGIN:
- mr BO, B
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L1x8_SUB0
+ ble ZGEMM_L1x8_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L1x8_SUB4
+ ble ZGEMM_L1x8_SUB4
-.LZGEMM_L1x8_LOOP_START:
+ZGEMM_L1x8_LOOP_START:
dcbt AO, PRE
LOAD1x8_1
@@ -499,11 +544,11 @@
KERNEL1x8_2
addic. L, L, -2
- ble .LZGEMM_L1x8_LOOP_END
+ ble ZGEMM_L1x8_LOOP_END
.align 5
-.LZGEMM_L1x8_LOOP:
+ZGEMM_L1x8_LOOP:
dcbt AO, PRE
KERNEL1x8_1
@@ -524,9 +569,9 @@
KERNEL1x8_2
addic. L, L, -1
- bgt .LZGEMM_L1x8_LOOP
+ bgt ZGEMM_L1x8_LOOP
-.LZGEMM_L1x8_LOOP_END:
+ZGEMM_L1x8_LOOP_END:
dcbt AO, PRE
KERNEL1x8_1
@@ -545,9 +590,9 @@
KERNEL1x8_1
KERNEL1x8_E2
- b .LZGEMM_L1x8_SUB1
+ b ZGEMM_L1x8_SUB1
-.LZGEMM_L1x8_SUB4:
+ZGEMM_L1x8_SUB4:
dcbt AO, PRE
KERNEL1x8_SUBI1
@@ -563,53 +608,53 @@
KERNEL1x8_SUB1
KERNEL1x8_SUB1
- b .LZGEMM_L1x8_SUB1
+ b ZGEMM_L1x8_SUB1
-.LZGEMM_L1x8_SUB0:
+ZGEMM_L1x8_SUB0:
andi. L, K, 7
KERNEL1x8_SUBI1
addic. L, L, -1
- ble .LZGEMM_L1x8_SAVE
- b .LZGEMM_L1x8_SUB2
+ ble ZGEMM_L1x8_SAVE
+ b ZGEMM_L1x8_SUB2
-.LZGEMM_L1x8_SUB1:
+ZGEMM_L1x8_SUB1:
andi. L, K, 7
- ble .LZGEMM_L1x8_SAVE
+ ble ZGEMM_L1x8_SAVE
-.LZGEMM_L1x8_SUB2:
+ZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1
addic. L, L, -1
- bgt .LZGEMM_L1x8_SUB2
+ bgt ZGEMM_L1x8_SUB2
-.LZGEMM_L1x8_SAVE:
+ZGEMM_L1x8_SAVE:
SAVE1x8
addic. I, I, -1
- bgt .LZGEMM_L1x8_BEGIN
+ bgt ZGEMM_L1x8_BEGIN
-.LZGEMM_L1x8_END:
+ZGEMM_L1x8_END:
-.LZGEMM_L1x4_BEGIN:
+ZGEMM_L1x4_BEGIN:
andi. T2, M, 7
- ble .LZGEMM_L1x1_END
+ ble ZGEMM_L1x1_END
andi. T1, M, 4
- ble .LZGEMM_L1x4_END
- mr BO, B
+ ble ZGEMM_L1x4_END
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L1x4_SUB0
+ ble ZGEMM_L1x4_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L1x4_SUB4
+ ble ZGEMM_L1x4_SUB4
-.LZGEMM_L1x4_LOOP_START:
+ZGEMM_L1x4_LOOP_START:
LOAD1x4_1
KERNEL1x4_I1
@@ -623,11 +668,11 @@
KERNEL1x4_2
addic. L, L, -2
- ble .LZGEMM_L1x4_LOOP_END
+ ble ZGEMM_L1x4_LOOP_END
.align 5
-.LZGEMM_L1x4_LOOP:
+ZGEMM_L1x4_LOOP:
KERNEL1x4_1
KERNEL1x4_2
@@ -640,9 +685,9 @@
KERNEL1x4_2
addic. L, L, -1
- bgt .LZGEMM_L1x4_LOOP
+ bgt ZGEMM_L1x4_LOOP
-.LZGEMM_L1x4_LOOP_END:
+ZGEMM_L1x4_LOOP_END:
KERNEL1x4_1
KERNEL1x4_2
@@ -654,9 +699,9 @@
KERNEL1x4_1
KERNEL1x4_E2
- b .LZGEMM_L1x4_SUB1
+ b ZGEMM_L1x4_SUB1
-.LZGEMM_L1x4_SUB4:
+ZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1
KERNEL1x4_SUB1
@@ -668,48 +713,48 @@
KERNEL1x4_SUB1
KERNEL1x4_SUB1
- b .LZGEMM_L1x4_SUB1
+ b ZGEMM_L1x4_SUB1
-.LZGEMM_L1x4_SUB0:
+ZGEMM_L1x4_SUB0:
andi. L, K, 7
KERNEL1x4_SUBI1
addic. L, L, -1
- ble .LZGEMM_L1x4_SAVE
- b .LZGEMM_L1x4_SUB2
+ ble ZGEMM_L1x4_SAVE
+ b ZGEMM_L1x4_SUB2
-.LZGEMM_L1x4_SUB1:
+ZGEMM_L1x4_SUB1:
andi. L, K, 7
- ble .LZGEMM_L1x4_SAVE
+ ble ZGEMM_L1x4_SAVE
-.LZGEMM_L1x4_SUB2:
+ZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1
addic. L, L, -1
- bgt .LZGEMM_L1x4_SUB2
+ bgt ZGEMM_L1x4_SUB2
-.LZGEMM_L1x4_SAVE:
+ZGEMM_L1x4_SAVE:
SAVE1x4
-.LZGEMM_L1x4_END:
+ZGEMM_L1x4_END:
-.LZGEMM_L1x2_BEGIN:
+ZGEMM_L1x2_BEGIN:
andi. T1, M, 2
- ble .LZGEMM_L1x2_END
- mr BO, B
+ ble ZGEMM_L1x2_END
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L1x2_SUB0
+ ble ZGEMM_L1x2_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L1x2_SUB4
+ ble ZGEMM_L1x2_SUB4
-.LZGEMM_L1x2_LOOP_START:
+ZGEMM_L1x2_LOOP_START:
LOAD1x2_1
KERNEL1x2_I1
@@ -723,11 +768,11 @@
KERNEL1x2_2
addic. L, L, -2
- ble .LZGEMM_L1x2_LOOP_END
+ ble ZGEMM_L1x2_LOOP_END
.align 5
-.LZGEMM_L1x2_LOOP:
+ZGEMM_L1x2_LOOP:
KERNEL1x2_1
KERNEL1x2_2
@@ -740,9 +785,9 @@
KERNEL1x2_2
addic. L, L, -1
- bgt .LZGEMM_L1x2_LOOP
+ bgt ZGEMM_L1x2_LOOP
-.LZGEMM_L1x2_LOOP_END:
+ZGEMM_L1x2_LOOP_END:
KERNEL1x2_1
KERNEL1x2_2
@@ -754,9 +799,9 @@
KERNEL1x2_1
KERNEL1x2_E2
- b .LZGEMM_L1x2_SUB1
+ b ZGEMM_L1x2_SUB1
-.LZGEMM_L1x2_SUB4:
+ZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1
KERNEL1x2_SUB1
@@ -768,48 +813,48 @@
KERNEL1x2_SUB1
KERNEL1x2_SUB1
- b .LZGEMM_L1x2_SUB1
+ b ZGEMM_L1x2_SUB1
-.LZGEMM_L1x2_SUB0:
+ZGEMM_L1x2_SUB0:
andi. L, K, 7
KERNEL1x2_SUBI1
addic. L, L, -1
- ble .LZGEMM_L1x2_SAVE
- b .LZGEMM_L1x2_SUB2
+ ble ZGEMM_L1x2_SAVE
+ b ZGEMM_L1x2_SUB2
-.LZGEMM_L1x2_SUB1:
+ZGEMM_L1x2_SUB1:
andi. L, K, 7
- ble .LZGEMM_L1x2_SAVE
+ ble ZGEMM_L1x2_SAVE
-.LZGEMM_L1x2_SUB2:
+ZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1
addic. L, L, -1
- bgt .LZGEMM_L1x2_SUB2
+ bgt ZGEMM_L1x2_SUB2
-.LZGEMM_L1x2_SAVE:
+ZGEMM_L1x2_SAVE:
SAVE1x2
-.LZGEMM_L1x2_END:
+ZGEMM_L1x2_END:
-.LZGEMM_L1x1_BEGIN:
+ZGEMM_L1x1_BEGIN:
andi. T1, M, 1
- ble .LZGEMM_L1x1_END
- mr BO, B
+ ble ZGEMM_L1x1_END
+ mr BO, BBUFFER
srawi. L, K, 3
- ble .LZGEMM_L1x1_SUB0
+ ble ZGEMM_L1x1_SUB0
cmpwi cr0, L, 1
- ble .LZGEMM_L1x1_SUB4
+ ble ZGEMM_L1x1_SUB4
-.LZGEMM_L1x1_LOOP_START:
+ZGEMM_L1x1_LOOP_START:
LOAD1x1_1
KERNEL1x1_I1
@@ -823,11 +868,11 @@
KERNEL1x1_2
addic. L, L, -2
- ble .LZGEMM_L1x1_LOOP_END
+ ble ZGEMM_L1x1_LOOP_END
.align 5
-.LZGEMM_L1x1_LOOP:
+ZGEMM_L1x1_LOOP:
KERNEL1x1_1
KERNEL1x1_2
@@ -840,9 +885,9 @@
KERNEL1x1_2
addic. L, L, -1
- bgt .LZGEMM_L1x1_LOOP
+ bgt ZGEMM_L1x1_LOOP
-.LZGEMM_L1x1_LOOP_END:
+ZGEMM_L1x1_LOOP_END:
KERNEL1x1_1
KERNEL1x1_2
@@ -854,9 +899,9 @@
KERNEL1x1_1
KERNEL1x1_E2
- b .LZGEMM_L1x1_SUB1
+ b ZGEMM_L1x1_SUB1
-.LZGEMM_L1x1_SUB4:
+ZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1
KERNEL1x1_SUB1
@@ -868,34 +913,34 @@
KERNEL1x1_SUB1
KERNEL1x1_SUB1
- b .LZGEMM_L1x1_SUB1
+ b ZGEMM_L1x1_SUB1
-.LZGEMM_L1x1_SUB0:
+ZGEMM_L1x1_SUB0:
andi. L, K, 7
KERNEL1x1_SUBI1
addic. L, L, -1
- ble .LZGEMM_L1x1_SAVE
- b .LZGEMM_L1x1_SUB2
+ ble ZGEMM_L1x1_SAVE
+ b ZGEMM_L1x1_SUB2
-.LZGEMM_L1x1_SUB1:
+ZGEMM_L1x1_SUB1:
andi. L, K, 7
- ble .LZGEMM_L1x1_SAVE
+ ble ZGEMM_L1x1_SAVE
-.LZGEMM_L1x1_SUB2:
+ZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1
addic. L, L, -1
- bgt .LZGEMM_L1x1_SUB2
+ bgt ZGEMM_L1x1_SUB2
-.LZGEMM_L1x1_SAVE:
+ZGEMM_L1x1_SAVE:
SAVE1x1
-.LZGEMM_L1x1_END:
+ZGEMM_L1x1_END:
-.LZGEMM_L1_END:
+ZGEMM_L1_END:
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
index 701ec65c8..a0fbb2e11 100644
--- a/kernel/power/zgemm_macros_8x2_power8.S
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -1,39 +1,3 @@
-/***************************************************************************
-Copyright (c) 2013-2016, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-/**************************************************************************************
-* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-* LAPACK-TEST : OK
-**************************************************************************************/
-
-
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
@@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x8_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x8_1
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
+
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
-
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
- lxvd2x vs8, o0, AO // load real,imag from A
- lxvd2x vs9, o16, AO // load real,imag from A
-
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
-
- lxvd2x vs10, o32, AO // load real,imag from A
- lxvd2x vs11, o48, AO // load real,imag from A
-
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
-
- addi AO, AO, 64
-
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
@@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
-
- lxvd2x vs12, o0, AO // load real,imag from A
- lxvd2x vs13, o16, AO // load real,imag from A
-
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
-
- lxvd2x vs14, o32, AO // load real,imag from A
- lxvd2x vs15, o48, AO // load real,imag from A
-
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
-
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
-
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
- addi AO, AO, 64
- addi BO, BO, 32
.endm
.macro KERNEL2x8_2
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
+
+ addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
-
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
-
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
- lxvd2x vs0, o0, AO // load real,imag from A
- lxvd2x vs1, o16, AO // load real,imag from A
-
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
-
- lxvd2x vs2, o32, AO // load real,imag from A
- lxvd2x vs3, o48, AO // load real,imag from A
-
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
- addi AO, AO, 64
-
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
-
- lxvd2x vs4, o0, AO // load real,imag from A
- lxvd2x vs5, o16, AO // load real,imag from A
-
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
-
- lxvd2x vs6, o32, AO // load real,imag from A
- lxvd2x vs7, o48, AO // load real,imag from A
-
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
-
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
-
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
- addi AO, AO, 64
- addi BO, BO, 32
.endm
@@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x4_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x2_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x1_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
@@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
- lxvdsx vs22, o16, BO // load real part from B
- lxvdsx vs23, o24, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
+ lxvd2x vs22, o32, BO // load real part from B
+ lxvd2x vs23, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
- lxvdsx vs18, o16, BO // load real part from B
- lxvdsx vs19, o24, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
+ lxvd2x vs18, o32, BO // load real part from B
+ lxvd2x vs19, o48, BO // load imag part from B
- addi BO, BO, 32
+ addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x8_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x4_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x2_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x1_1
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
@@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs20, o0, BO // load real part from B
- lxvdsx vs21, o8, BO // load imag part from B
+ lxvd2x vs20, o0, BO // load real part from B
+ lxvd2x vs21, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
- lxvdsx vs16, o0, BO // load real part from B
- lxvdsx vs17, o8, BO // load imag part from B
+ lxvd2x vs16, o0, BO // load real part from B
+ lxvd2x vs17, o16, BO // load imag part from B
- addi BO, BO, 16
+ addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
new file mode 100644
index 000000000..213839a8f
--- /dev/null
+++ b/kernel/power/zscal.c
@@ -0,0 +1,176 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+#include "common.h"
+
+#pragma GCC optimize "O1"
+
+#if defined(POWER8)
+#include "zscal_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha)
+{
+
+ BLASLONG i=0;
+ FLOAT *x1=x;
+ FLOAT alpha_r1=alpha[0];
+ FLOAT alpha_r2=alpha[1];
+ FLOAT alpha_i1=alpha[2];
+ FLOAT alpha_i2=alpha[3];
+ FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31;
+ FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i;
+
+ if ( n1 > 0 )
+ {
+ alpha[0] = da_r;
+ alpha[1] = da_r;
+ alpha[2] = -da_i;
+ alpha[3] = da_i;
+ zscal_kernel_8(n1, x, alpha);
+ i=n1;
+ ip = n1 * 2;
+
+ }
+
+ while ( i < n )
+ {
+
+ temp = da_r * x[ip] - da_i * x[ip+1] ;
+ x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
+ x[ip] = temp;
+ ip += 2;
+ i++;
+ }
+
+ }
+ else
+ {
+
+ inc_x2 = 2 * inc_x;
+
+ while ( i < n )
+ {
+
+ temp = da_r * x[ip] - da_i * x[ip+1] ;
+ x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
+ x[ip] = temp;
+ ip += inc_x2;
+ i++;
+ }
+
+
+ }
+
+ return(0);
+
+}
+
+
diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c
new file mode 100644
index 000000000..5e09d8d79
--- /dev/null
+++ b/kernel/power/zscal_microk_power8.c
@@ -0,0 +1,224 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+*
+* I don't use fused multiply-add ( LAPACK precision problems )
+*
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_8 1
+
+static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *x2=x+1;
+ BLASLONG pre = 384;
+
+ __asm__ __volatile__
+ (
+
+ "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r
+ "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i
+ "addi %1, %1, -8 \n\t"
+
+ "dcbt %2, %4 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "dcbt %2, %4 \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp 52, 44, 32 \n\t"
+ "xvmuldp 53, 45, 32 \n\t"
+ "xvmuldp 54, 46, 32 \n\t"
+ "xvmuldp 55, 47, 32 \n\t"
+
+ "xxswapd 56, 40 \n\t"
+ "xxswapd 57, 41 \n\t"
+ "xxswapd 58, 42 \n\t"
+ "xxswapd 59, 43 \n\t"
+ "xxswapd 60, 44 \n\t"
+ "xxswapd 61, 45 \n\t"
+ "xxswapd 62, 46 \n\t"
+ "xxswapd 63, 47 \n\t"
+
+ "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
+ "xvmuldp 57, 57, 33 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 41, %5, %2 \n\t"
+
+ "xvmuldp 58, 58, 33 \n\t"
+ "xvmuldp 59, 59, 33 \n\t"
+
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+
+ "xvmuldp 60, 60, 33 \n\t"
+ "xvmuldp 61, 61, 33 \n\t"
+
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+
+ "xvmuldp 62, 62, 33 \n\t"
+ "xvmuldp 63, 63, 33 \n\t"
+
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "xvadddp 48, 48 , 56 \n\t"
+ "xvadddp 49, 49 , 57 \n\t"
+ "xvadddp 50, 50 , 58 \n\t"
+ "xvadddp 51, 51 , 59 \n\t"
+
+ "stxvd2x 48, 0, %1 \n\t"
+ "stxvd2x 49, %5, %1 \n\t"
+
+ "xvadddp 52, 52 , 60 \n\t"
+ "xvadddp 53, 53 , 61 \n\t"
+
+ "stxvd2x 50, %6, %1 \n\t"
+ "stxvd2x 51, %7, %1 \n\t"
+
+ "xvadddp 54, 54 , 62 \n\t"
+ "xvadddp 55, 55 , 63 \n\t"
+
+ "stxvd2x 52, %8, %1 \n\t"
+ "stxvd2x 53, %9, %1 \n\t"
+ "stxvd2x 54, %10, %1 \n\t"
+ "stxvd2x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %0 , %0 , -8 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp 52, 44, 32 \n\t"
+ "xvmuldp 53, 45, 32 \n\t"
+ "xvmuldp 54, 46, 32 \n\t"
+ "xvmuldp 55, 47, 32 \n\t"
+
+ "xxswapd 56, 40 \n\t"
+ "xxswapd 57, 41 \n\t"
+ "xxswapd 58, 42 \n\t"
+ "xxswapd 59, 43 \n\t"
+ "xxswapd 60, 44 \n\t"
+ "xxswapd 61, 45 \n\t"
+ "xxswapd 62, 46 \n\t"
+ "xxswapd 63, 47 \n\t"
+
+ "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
+ "xvmuldp 57, 57, 33 \n\t"
+ "xvmuldp 58, 58, 33 \n\t"
+ "xvmuldp 59, 59, 33 \n\t"
+ "xvmuldp 60, 60, 33 \n\t"
+ "xvmuldp 61, 61, 33 \n\t"
+ "xvmuldp 62, 62, 33 \n\t"
+ "xvmuldp 63, 63, 33 \n\t"
+
+ "xvadddp 48, 48 , 56 \n\t"
+ "xvadddp 49, 49 , 57 \n\t"
+ "xvadddp 50, 50 , 58 \n\t"
+ "xvadddp 51, 51 , 59 \n\t"
+ "xvadddp 52, 52 , 60 \n\t"
+ "xvadddp 53, 53 , 61 \n\t"
+ "xvadddp 54, 54 , 62 \n\t"
+ "xvadddp 55, 55 , 63 \n\t"
+
+ "stxvd2x 48, 0, %1 \n\t"
+ "stxvd2x 49, %5, %1 \n\t"
+ "stxvd2x 50, %6, %1 \n\t"
+ "stxvd2x 51, %7, %1 \n\t"
+ "stxvd2x 52, %8, %1 \n\t"
+ "stxvd2x 53, %9, %1 \n\t"
+ "stxvd2x 54, %10, %1 \n\t"
+ "stxvd2x 55, %11, %1 \n\t"
+
+
+ :
+ :
+ "r" (i), // 0
+ "r" (x2), // 1
+ "r" (x1), // 2
+ "r" (alpha), // 3
+ "r" (pre), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c
new file mode 100644
index 000000000..5ec1eee2e
--- /dev/null
+++ b/kernel/power/zswap.c
@@ -0,0 +1,175 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+
+#if defined(POWER8)
+#include "zswap_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG i=0;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+ FLOAT g0, g1, g2, g3, g4, g5, g6, g7;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+
+ if ( n1 > 0 )
+ {
+ zswap_kernel_16(n1, x, y);
+ i=n1;
+ ix = 2* n1;
+ iy = 2* n1;
+ }
+
+ while(i < n)
+ {
+
+ temp[0] = x[ix] ;
+ temp[1] = x[ix+1] ;
+ x[ix] = y[iy] ;
+ x[ix+1] = y[iy+1] ;
+ y[iy] = temp[0] ;
+ y[iy+1] = temp[1] ;
+
+ ix += 2 ;
+ iy += 2 ;
+ i++ ;
+
+
+ }
+
+
+ }
+ else
+ {
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+
+ temp[0] = x[ix] ;
+ temp[1] = x[ix+1] ;
+ x[ix] = y[iy] ;
+ x[ix+1] = y[iy+1] ;
+ y[iy] = temp[0] ;
+ y[iy+1] = temp[1] ;
+
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+
+ }
+ return(0);
+
+
+}
+
+
diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c
new file mode 100644
index 000000000..9e5623752
--- /dev/null
+++ b/kernel/power/zswap_microk_power8.c
@@ -0,0 +1,180 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#define HAVE_KERNEL_16 1
+
+static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+
+ BLASLONG i = n;
+ BLASLONG o16 = 16;
+ BLASLONG o32 = 32;
+ BLASLONG o48 = 48;
+ BLASLONG o64 = 64;
+ BLASLONG o80 = 80;
+ BLASLONG o96 = 96;
+ BLASLONG o112 = 112;
+ FLOAT *x1=x;
+ FLOAT *y1=y;
+ FLOAT *x2=x+1;
+ FLOAT *y2=y+1;
+ BLASLONG pre = 384;
+ BLASLONG alpha=0;
+
+ __asm__ __volatile__
+ (
+
+ "addi %3, %3, -8 \n\t"
+ "addi %4, %4, -8 \n\t"
+
+ ".align 5 \n\t"
+ "1: \n\t"
+
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 48, 0, %1 \n\t"
+ "lxvd2x 49, %5, %1 \n\t"
+ "lxvd2x 50, %6, %1 \n\t"
+ "lxvd2x 51, %7, %1 \n\t"
+ "lxvd2x 52, %8, %1 \n\t"
+ "lxvd2x 53, %9, %1 \n\t"
+ "lxvd2x 54, %10, %1 \n\t"
+ "lxvd2x 55, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "lxvd2x 56, 0, %1 \n\t"
+ "lxvd2x 57, %5, %1 \n\t"
+ "lxvd2x 58, %6, %1 \n\t"
+ "lxvd2x 59, %7, %1 \n\t"
+ "lxvd2x 60, %8, %1 \n\t"
+ "lxvd2x 61, %9, %1 \n\t"
+ "lxvd2x 62, %10, %1 \n\t"
+ "lxvd2x 63, %11, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 52, %8, %4 \n\t"
+ "stxvd2x 53, %9, %4 \n\t"
+ "stxvd2x 54, %10, %4 \n\t"
+ "stxvd2x 55, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvd2x 56, 0, %4 \n\t"
+ "stxvd2x 57, %5, %4 \n\t"
+ "stxvd2x 58, %6, %4 \n\t"
+ "stxvd2x 59, %7, %4 \n\t"
+ "stxvd2x 60, %8, %4 \n\t"
+ "stxvd2x 61, %9, %4 \n\t"
+ "stxvd2x 62, %10, %4 \n\t"
+ "stxvd2x 63, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %0 , %0 , -16 \n\t"
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (y1), // 1
+ "r" (x1), // 2
+ "r" (y2), // 3
+ "r" (x2), // 4
+ "r" (o16), // 5
+ "r" (o32), // 6
+ "r" (o48), // 7
+ "r" (o64), // 8
+ "r" (o80), // 9
+ "r" (o96), // 10
+ "r" (o112) // 11
+ : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
+ );
+
+}
+
+
diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S
index 8b953765e..0cfe613d5 100644
--- a/kernel/power/ztrmm_kernel_8x2_power8.S
+++ b/kernel/power/ztrmm_kernel_8x2_power8.S
@@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
-#include "zgemm_macros_8x2_power8.S"
+#include "ztrmm_macros_8x2_power8.S"
cmpwi cr0, M, 0
ble .L999
diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S
new file mode 100644
index 000000000..701ec65c8
--- /dev/null
+++ b/kernel/power/ztrmm_macros_8x2_power8.S
@@ -0,0 +1,3110 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xssubdp
+
+#else // CC || CR || RC || RR
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xssubdp
+
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+
+.endm
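+
+// KERNEL2x8_E2 drains the pipeline: it issues the multiply-adds for the operands
+// prefetched by the preceding KERNEL2x8_1 (vs8-vs15 and vs20-vs23) without any
+// further loads and without advancing AO or BO, so it is used to close the
+// unrolled K loop.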
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
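+
+// KERNEL2x8_SUBI1 and KERNEL2x8_SUB1 process the K remainder one iteration at a
+// time: _SUBI1 starts a fresh accumulation with xvmuldp (overwriting vs32-vs63),
+// while _SUB1 adds on top of existing results with xvmaddadp. Both load their
+// own operands and advance AO by 128 bytes and BO by 32 bytes per call.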
+
+.macro SAVE2x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
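+
+// Reduction and alpha scaling, done once per complex element of C. Each
+// accumulator pair, e.g. (vs32, vs33), holds (realA*realB, imagA*realB) and
+// (realA*imagB, imagA*imagB). The XSFADD_R1/R2/I1/I2 macros (assumed to be
+// defined by the including kernel file as xsadddp or xssubdp, depending on the
+// conjugation variant) collapse these into the scalar real part (vs0) and
+// imaginary part (vs1), which are then scaled by alpha:
+//   C_re = re*alpha_r - im*alpha_i
+//   C_im = re*alpha_i + im*alpha_r
+// The pair is merged back into one vector with xxpermdi and, outside of
+// TRMMKERNEL, added to the previous contents of C. The first row of C
+// (vs32-vs47) is stored at CO in two 64-byte halves via T1/T2, the second row
+// (vs48-vs63) at CO + LDC, and CO is finally advanced by 128 bytes
+// (8 complex doubles).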
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs48 // realA*realB
+ XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
+
+ xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs48 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs49 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs50 // realA*realB
+ XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
+
+ xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs50 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs51 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs52 // realA*realB
+ XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
+
+ xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs52 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs53 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs54 // realA*realB
+ XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
+
+ xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs54 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs55 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs56 // realA*realB
+ XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
+
+ xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs56 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs57 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs58 // realA*realB
+ XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
+
+ xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs58 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs59 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs60 // realA*realB
+ XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
+
+ xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs60 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs61 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs62 // realA*realB
+ XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
+
+ xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs62 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs63 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
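+
+// The M=4, M=2 and M=1 groups below repeat the M=8 pipeline with fewer A vectors
+// per K iteration; AO (and CO in the corresponding SAVE macros) advance by 64,
+// 32 and 16 bytes respectively instead of 128.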
+
+.macro LOAD2x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
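+
+// The N=1 macros follow the same pattern as the N=2 groups above, but only one
+// complex value of B is splatted per K iteration (vs16/vs17, alternating with
+// vs20/vs21), BO advances by 16 bytes instead of 32, and a single set of
+// accumulators (vs32-vs47 for M=8) is needed because only one row of C is
+// updated.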
+
+.macro LOAD1x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
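+// Note: SAVE1x8 below reduces the eight accumulator pairs (vs32..vs47) to
+// eight complex results, scales them by alpha and stores 128 bytes of C.
+// Writing C_re/C_im for the two halves of a C element, and assuming the
+// non-conjugated expansion of the XSFADD_R*/XSFADD_I* helpers (they may add
+// or subtract depending on the conjugation variant), the per-element math is:
+//   real = realA*realB - imagA*imagB
+//   imag = realA*imagB + imagA*realB
+//   C_re = real*alpha_r - imag*alpha_i   (plus the old C_re unless TRMMKERNEL)
+//   C_im = real*alpha_i + imag*alpha_r   (plus the old C_im unless TRMMKERNEL)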
+.macro SAVE1x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
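+// The 1x4 macros below follow the same software-pipelined pattern as the
+// 1x8 group above: LOAD1x4_1 primes vs0-vs3 and vs16/vs17, KERNEL1x4_I1
+// starts the accumulators with xvmuldp while loading the alternate set
+// vs8-vs11 and vs20/vs21, KERNEL1x4_1/KERNEL1x4_2 ping-pong between the two
+// register sets, KERNEL1x4_E2 drains the pipeline, and the SUBI1/SUB1
+// variants handle the k-loop remainder without double buffering.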
+
+.macro LOAD1x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
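+// Same double-buffered structure as the 1x4 group above, at vector width 2
+// (two complex doubles, i.e. 32 bytes of A per k-iteration).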
+
+.macro LOAD1x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
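+// The 1x1 case is the scalar picture of the scheme above; per k-iteration,
+// with A holding (realA,imagA) and B broadcast as realB and imagB:
+//   vs32 += (realA,imagA) * realB      // real*real, imag*real
+//   vs33 += (realA,imagA) * imagB      // real*imag, imag*imag
+// SAVE1x1 then folds the two halves into one alpha-scaled element of C.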
+
+.macro LOAD1x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER
index f14c82303..4ec748284 100644
--- a/kernel/x86_64/KERNEL.STEAMROLLER
+++ b/kernel/x86_64/KERNEL.STEAMROLLER
@@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c
-ZGEMVNKERNEL = zgemv_t_4.c
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c
index a6da1fea7..a3d20d276 100644
--- a/kernel/x86_64/sdot.c
+++ b/kernel/x86_64/sdot.c
@@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
-	FLOAT dot = 0.0 ;
+	double dot = 0.0 ;
+	FLOAT mydot=0.0;
+ BLASLONG n1;
if ( n <= 0 ) return(dot);
if ( (inc_x == 1) && (inc_y == 1) )
{
- BLASLONG n1 = n & -32;
+ n1 = n & (BLASLONG)(-32);
if ( n1 )
- sdot_kernel_16(n1, x, y , &dot );
+ sdot_kernel_16(n1, x, y , &mydot );
i = n1;
@@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
i++ ;
}
+ dot+=mydot;
return(dot);
}
- BLASLONG n1 = n & -2;
+ n1 = n & (BLASLONG)(-2);
while(i < n1)
{
@@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
-
diff --git a/param.h b/param.h
index 31125d8e4..a6ead4b64 100644
--- a/param.h
+++ b/param.h
@@ -1961,35 +1961,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8)
-#define SNUMOPT 4
+#define SNUMOPT 16
#define DNUMOPT 8
-#define GEMM_DEFAULT_OFFSET_A 384
-#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_OFFSET_A 4096
+#define GEMM_DEFAULT_OFFSET_B 4096
#define GEMM_DEFAULT_ALIGN 0x03fffUL
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 992
+#define SGEMM_DEFAULT_P 960
#define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 488
-#define ZGEMM_DEFAULT_P 240
+#define CGEMM_DEFAULT_P 720
+#define ZGEMM_DEFAULT_P 480
-#define SGEMM_DEFAULT_Q 504
+#define SGEMM_DEFAULT_Q 720
#define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 400
-#define ZGEMM_DEFAULT_Q 360
+#define CGEMM_DEFAULT_Q 720
+#define ZGEMM_DEFAULT_Q 720
-#define SGEMM_DEFAULT_R 28800
+#define SGEMM_DEFAULT_R 21600
#define DGEMM_DEFAULT_R 14400
-#define ZGEMM_DEFAULT_R 7200
+#define CGEMM_DEFAULT_R 16200
+#define ZGEMM_DEFAULT_R 21600
#define SYMV_P 8